diff --git a/.github/workflows/manual_release_stable.yaml b/.github/workflows/manual_release_stable.yaml index 5bd66a9920..d3e5ddc4e3 100644 --- a/.github/workflows/manual_release_stable.yaml +++ b/.github/workflows/manual_release_stable.yaml @@ -104,15 +104,83 @@ jobs: # TODO: add job for publish package to Conda # https://github.com/apify/crawlee-python/issues/104 + version_docs: + name: Version docs + needs: [release_prepare, changelog_update, pypi_publish] + runs-on: ubuntu-latest + outputs: + version_docs_commitish: ${{ steps.commit_versioned_docs.outputs.commit_long_sha }} + permissions: + contents: write + env: + NODE_VERSION: 22 + PYTHON_VERSION: 3.14 + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} + ref: ${{ needs.changelog_update.outputs.changelog_commitish }} + + - name: Set up Node + uses: actions/setup-node@v6 + with: + node-version: ${{ env.NODE_VERSION }} + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv package manager + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Python dependencies + run: uv run poe install-dev + + - name: Install website dependencies + run: | + cd website + corepack enable + yarn install + + - name: Snapshot the current version + run: | + cd website + VERSION="$(python -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('../pyproject.toml').read_text())['project']['version'])")" + MAJOR_MINOR="$(echo "$VERSION" | cut -d. -f1-2)" + export MAJOR_MINOR + # Remove existing version if present (patch releases override) + rm -rf "versioned_docs/version-${MAJOR_MINOR}" + rm -rf "versioned_sidebars/version-${MAJOR_MINOR}-sidebars.json" + jq 'map(select(. 
!= env.MAJOR_MINOR))' versions.json > tmp.json && mv tmp.json versions.json + # Copy changelog + cp ../CHANGELOG.md ../docs/changelog.md + # Build API reference and create version snapshots + bash build_api_reference.sh + npx docusaurus docs:version "$MAJOR_MINOR" + npx docusaurus api:version "$MAJOR_MINOR" + + - name: Commit and push versioned docs + id: commit_versioned_docs + uses: EndBug/add-and-commit@v10 + with: + add: "website/versioned_docs website/versioned_sidebars website/versions.json" + message: "docs: version ${{ needs.release_prepare.outputs.version_number }} docs [skip ci]" + default_author: github_actions + doc_release: name: Doc release - needs: [changelog_update, pypi_publish] + needs: [changelog_update, pypi_publish, version_docs] permissions: contents: write pages: write id-token: write uses: ./.github/workflows/_release_docs.yaml with: - # Use the ref from the changelog update to include the updated changelog. - ref: ${{ needs.changelog_update.outputs.changelog_commitish }} + # Use the version_docs commit to include both changelog and versioned docs. 
+ ref: ${{ needs.version_docs.outputs.version_docs_commitish }} secrets: inherit diff --git a/pyproject.toml b/pyproject.toml index cdcb9d4c7f..9115564cbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,6 +140,9 @@ packages = ["src/crawlee"] [tool.ruff] line-length = 120 include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] +exclude = [ + "website/versioned_docs/**", +] extend-exclude = ["src/crawlee/project_template"] [tool.ruff.lint] @@ -251,6 +254,7 @@ include = ["src", "tests", "scripts", "docs", "website"] exclude = [ "src/crawlee/project_template", "docs/guides/code_examples/storage_clients/custom_storage_client_example.py", + "website/versioned_docs", ] [[tool.ty.overrides]] diff --git a/typos.toml b/typos.toml index 34e01e2774..ae1990f462 100644 --- a/typos.toml +++ b/typos.toml @@ -13,7 +13,8 @@ extend-exclude = [ "*.lock", "*.min.js", "*.min.css", - "CHANGELOG.md", + "**/CHANGELOG.md", + "**/changelog.md", ] [default.extend-identifiers] @@ -25,3 +26,4 @@ asend = "asend" # Python async generator method # Add project-specific words that should not be treated as typos mke = "mke" # Sennheiser MKE product name consts = "consts" # Common abbreviation for "constants" +certifi = "certifi" # Python package name diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index c4c16cea1a..3e936273ab 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -94,7 +94,6 @@ module.exports = { path: '../docs', sidebarPath: './sidebars.js', rehypePlugins: [externalLinkProcessor], - // disableVersioning: true, editUrl: (doc) => { return `https://github.com/apify/crawlee-python/edit/master/website/${doc.versionDocsDirPath}/${doc.docPath}`; }, @@ -118,6 +117,7 @@ module.exports = { }, sortSidebar: groupSort, routeBasePath: 'api', + python: true, pythonOptions: { pythonModulePath: path.join(__dirname, '../src/crawlee'), moduleShortcutsPath: path.join(__dirname, 'module_shortcuts.json'), @@ -172,6 +172,12 
@@ module.exports = { includeVersionedDocs: false, enableLlmsFullTxt: true, relativePaths: false, + excludeRoutes: [ + '/python/api/[0-9]*/**', + '/python/api/[0-9]*', + '/python/api/next/**', + '/python/api/next', + ], }, }, ], @@ -281,6 +287,12 @@ module.exports = { label: 'Blog', position: 'left', }, + { + type: 'docsVersionDropdown', + position: 'right', + dropdownItemsBefore: [], + dropdownItemsAfter: [], + }, ], }, colorMode: { diff --git a/website/src/theme/Navbar/Content/index.js b/website/src/theme/Navbar/Content/index.js index edf9236cae..343c2e1846 100644 --- a/website/src/theme/Navbar/Content/index.js +++ b/website/src/theme/Navbar/Content/index.js @@ -1,5 +1,4 @@ import Link from '@docusaurus/Link'; -import { useLocation } from '@docusaurus/router'; import { useThemeConfig } from '@docusaurus/theme-common'; import { splitNavbarItems, @@ -38,32 +37,10 @@ function NavbarContentLayout({ left, right }) { ); } -const VERSIONS_ITEM = { - type: 'docsVersionDropdown', - position: 'left', - label: 'Versions', - dropdownItemsAfter: [ - { - href: 'https://sdk.apify.com/docs/guides/getting-started', - label: '2.2', - }, - { - href: 'https://sdk.apify.com/docs/1.3.1/guides/getting-started', - label: '1.3', - }, - ], - dropdownItemsBefore: [], -}; - export default function NavbarContent() { - const location = useLocation(); const mobileSidebar = useNavbarMobileSidebar(); const items = useNavbarItems(); - const effectiveItems = location.pathname?.endsWith('/python/') - || location.pathname?.endsWith('/python') - ? 
items - : [...items, VERSIONS_ITEM]; - const [leftItems, rightItems] = splitNavbarItems(effectiveItems); + const [leftItems, rightItems] = splitNavbarItems(items); const searchBarItem = items.find((item) => item.type === 'search'); return ( None:\n context.log.info(f'Request without label {context.request.url} ...')\n\n\n# Handler for category requests\n@router.handler(label='category')\nasync def category_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Category request {context.request.url} ...')\n\n\n# Handler for product requests\n@router.handler(label='product')\nasync def product_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Product {context.request.url} ...')\n\n\nasync def main() -> None:\n crawler = HttpCrawler(request_handler=router)\n await crawler.run()" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 12, + 4, + 6, + 9 + ], + "title": "Methods" + } + ], + "id": 3, + "module": "router", + "name": "Router", + "parsedDocstring": { + "text": "A request dispatching system that routes requests to registered handlers based on their labels.\n\nThe `Router` allows you to define and register request handlers for specific labels. When a request is received,\nthe router invokes the corresponding `request_handler` based on the request's `label`. 
If no matching handler\nis found, the default handler is used.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[HttpCrawlingContext]()\n\n\n# Handler for requests without a matching label handler\n@router.default_handler\nasync def default_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Request without label {context.request.url} ...')\n\n\n# Handler for category requests\n@router.handler(label='category')\nasync def category_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Category request {context.request.url} ...')\n\n\n# Handler for product requests\n@router.handler(label='product')\nasync def product_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Product {context.request.url} ...')\n\n\nasync def main() -> None:\n crawler = HttpCrawler(request_handler=router)\n await crawler.run()" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 16, + "module": "proxy_configuration", + "name": "url", + "parsedDocstring": { + "text": "The URL of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The scheme of the proxy." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 17, + "module": "proxy_configuration", + "name": "scheme", + "parsedDocstring": { + "text": "The scheme of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The hostname of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 18, + "module": "proxy_configuration", + "name": "hostname", + "parsedDocstring": { + "text": "The hostname of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy port." + } + ] + }, + "flags": {}, + "groups": [], + "id": 19, + "module": "proxy_configuration", + "name": "port", + "parsedDocstring": { + "text": "The proxy port." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The username for the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 20, + "module": "proxy_configuration", + "name": "username", + "parsedDocstring": { + "text": "The username for the proxy." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The password for the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 21, + "module": "proxy_configuration", + "name": "password", + "parsedDocstring": { + "text": "The password for the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The identifier of the used proxy session, if used.\nUsing the same session ID guarantees getting the same proxy URL." + } + ] + }, + "flags": {}, + "groups": [], + "id": 22, + "module": "proxy_configuration", + "name": "session_id", + "parsedDocstring": { + "text": "The identifier of the used proxy session, if used.\nUsing the same session ID guarantees getting the same proxy URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The tier of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 23, + "module": "proxy_configuration", + "name": "proxy_tier", + "parsedDocstring": { + "text": "The tier of the proxy." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides information about a proxy connection that is used for requests." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 18, + 21, + 19, + 23, + 17, + 22, + 16, + 20 + ], + "title": "Properties" + } + ], + "id": 15, + "module": "proxy_configuration", + "name": "ProxyInfo", + "parsedDocstring": { + "text": "Provides information about a proxy connection that is used for requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 25, + "module": "proxy_configuration", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n", + "args": { + "proxy_urls": "A list of URLs of proxies that will be rotated in a round-robin fashion", + "tiered_proxy_urls": "A list of URL tiers (where a tier is a list of proxy URLs). 
Crawlers will automatically\ntry to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in\nthe selected tier will be rotated in a round-robin fashion.", + "new_url_function": "A function that returns a proxy URL for a given Request. This provides full control over\nthe proxy selection mechanism." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n" + } + ] + }, + "flags": {}, + "id": 26, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of URLs of proxies that will be rotated in a round-robin fashion" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 27, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_urls", + "type": { + "name": "list[str | None] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that returns a proxy URL for a given Request. This provides full control over\nthe proxy selection mechanism." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 28, + "kind": 32768, + "kindString": "Parameter", + "name": "new_url_function", + "type": { + "name": "_NewUrlFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_NewUrlFunction", + "target": "55" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically\ntry to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in\nthe selected tier will be rotated in a round-robin fashion." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 29, + "kind": 32768, + "kindString": "Parameter", + "name": "tiered_proxy_urls", + "type": { + "name": "list[list[str | None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ], + "target": "866" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 30, + "module": "proxy_configuration", + "name": "new_proxy_info", + "parsedDocstring": { + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n", + "args": { + "session_id": "Session identifier. 
If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided.", + "request": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly.", + "proxy_tier": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n" + } + ] + }, + "flags": {}, + "id": 31, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_proxy_info", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 32, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 33, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 34, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_tier", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 35, + "module": "proxy_configuration", + "name": "new_url", + "parsedDocstring": { + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n", + "args": { + "session_id": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided.", + "request": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly.", + "proxy_tier": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n" + } + ] + }, + "flags": {}, + "id": 36, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 37, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 38, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 39, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_tier", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configures connection to a proxy server with the provided options.\n\nProxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or\nblacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies\nfor all connections. You can get information about the currently used proxy by inspecting the {@apilink ProxyInfo}\nproperty in your crawler's page function. There, you can inspect the proxy's URL and other attributes.\n\nIf you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of\nproxy URLs will be rotated by the configuration if this option is provided." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 25, + 30, + 35 + ], + "title": "Methods" + } + ], + "id": 24, + "module": "proxy_configuration", + "name": "ProxyConfiguration", + "parsedDocstring": { + "text": "Configures connection to a proxy server with the provided options.\n\nProxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or\nblacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies\nfor all connections. 
You can get information about the currently used proxy by inspecting the {@apilink ProxyInfo}\nproperty in your crawler's page function. There, you can inspect the proxy's URL and other attributes.\n\nIf you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of\nproxy URLs will be rotated by the configuration if this option is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 41, + "module": "proxy_configuration", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 42, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 43, + "kind": 32768, + "kindString": "Parameter", + "name": "tiered_proxy_urls", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ], + "target": "866" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 44, + "module": "proxy_configuration", + "name": "all_urls", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 45, + "module": "proxy_configuration", + "name": "get_tier_urls", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 46, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_tier_urls", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 47, + "kind": 32768, + "kindString": "Parameter", + "name": "tier_number", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 48, + "module": "proxy_configuration", + "name": "add_error", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 49, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 50, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 51, + "kind": 32768, + "kindString": "Parameter", + "name": "tier", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 52, + "module": "proxy_configuration", + "name": "predict_tier", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 53, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict_tier", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 54, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + 
"name": "int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 41, + 48, + 45, + 52 + ], + "title": "Methods" + }, + { + "children": [ + 44 + ], + "title": "Properties" + } + ], + "id": 40, + "module": "proxy_configuration", + "name": "_ProxyTierTracker", + "parsedDocstring": { + "text": "Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 56, + "module": "proxy_configuration", + "name": "__call__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 57, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 58, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", 
+ "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 59, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None | Awaitable[str | None]", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 56 + ], + "title": "Methods" + } + ], + "id": 55, + "module": "proxy_configuration", + "name": "_NewUrlFunction", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 263 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 60, + "module": "errors", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown from an 
user-defined error handler." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 61, + "module": "errors", + "name": "UserDefinedErrorHandlerError", + "parsedDocstring": { + "text": "Wraps an exception thrown from an user-defined error handler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Errors of `SessionError` type will trigger a session rotation.\n\nThis error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 62, + "module": "errors", + "name": "SessionError", + "parsedDocstring": { + "text": "Errors of `SessionError` type will trigger a session rotation.\n\nThis error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "ProxyError", + "target": "69", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 64, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 65, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 66, + "kind": 32768, + "kindString": "Parameter", + "name": "service", + "type": { + "name": "type", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 67, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "object", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 68, + "kind": 32768, + "kindString": "Parameter", + "name": "existing_value", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when attempting to reassign a service in service container that is already in use." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 64 + ], + "title": "Methods" + } + ], + "id": 63, + "module": "errors", + "name": "ServiceConflictError", + "parsedDocstring": { + "text": "Raised when attempting to reassign a service in service container that is already in use." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when a proxy is being blocked or malfunctions." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 69, + "module": "errors", + "name": "ProxyError", + "parsedDocstring": { + "text": "Raised when a proxy is being blocked or malfunctions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "SessionError", + "target": "62", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 71, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 72, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 73, + "kind": 32768, + "kindString": "Parameter", + "name": "message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 74, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 71, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when the response status code indicates an error." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 71 + ], + "title": "Methods" + } + ], + "id": 70, + "module": "errors", + "name": "HttpStatusCodeError", + "parsedDocstring": { + "text": "Raised when the response status code indicates an error." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpClientStatusCodeError", + "target": "75", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3615, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 72, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 73, + "kind": 32768, + "kindString": "Parameter", + "name": "message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 74, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 71, + "type": 
"reference" + } + } + ], + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 71, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when the response status code indicates an client error." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3615 + ], + "title": "Methods" + } + ], + "id": 75, + "module": "errors", + "name": "HttpClientStatusCodeError", + "parsedDocstring": { + "text": "Raised when the response status code indicates an client error." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpStatusCodeError", + "target": "70", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 77, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 78, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 79, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 80, + "kind": 
32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "TCrawlingContext", + "type": "reference", + "target": "1" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown from a request handler (router) and extends it with crawling context." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 77 + ], + "title": "Methods" + } + ], + "id": 76, + "module": "errors", + "name": "RequestHandlerError", + "parsedDocstring": { + "text": "Wraps an exception thrown from a request handler (router) and extends it with crawling context." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 82, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 83, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 84, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { + "isOptional": 
false, + "keyword-only": false + }, + "id": 85, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown in the initialization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 82 + ], + "title": "Methods" + } + ], + "id": 81, + "module": "errors", + "name": "ContextPipelineInitializationError", + "parsedDocstring": { + "text": "Wraps an exception thrown in the initialization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 87, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 88, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 89, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 90, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown in the finalization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 87 + ], + "title": "Methods" + } + ], + "id": 86, + "module": "errors", + "name": "ContextPipelineFinalizationError", + "parsedDocstring": { + "text": "Wraps an exception thrown in the finalization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "May be thrown in the initialization phase of a middleware to signal that the request should not be processed." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 91, + "module": "errors", + "name": "ContextPipelineInterruptedError", + "parsedDocstring": { + "text": "May be thrown in the initialization phase of a middleware to signal that the request should not be processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when a request cannot be processed due to a conflict with required resources." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 92, + "module": "errors", + "name": "RequestCollisionError", + "parsedDocstring": { + "text": "Raised when a request cannot be processed due to a conflict with required resources." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 94, + "module": "configuration", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for the internal asynchronous operations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 95, + "module": "configuration", + "name": "internal_timeout", + "parsedDocstring": { + "text": "Timeout for the internal asynchronous operations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 96, + "module": "configuration", + "name": "default_browser_path", + "parsedDocstring": { + "text": "Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Annotated[ str | None, Field( validation_alias=AliasChoices( 'apify_default_browser_path', 'crawlee_default_browser_path', ) ), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables the sandbox for the browser. 
Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 97, + "module": "configuration", + "name": "disable_browser_sandbox", + "parsedDocstring": { + "text": "Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logging level." + } + ] + }, + "flags": {}, + "groups": [], + "id": 98, + "module": "configuration", + "name": "log_level", + "parsedDocstring": { + "text": "The logging level." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "DEBUG" + }, + { + "type": "literal", + "value": "INFO" + }, + { + "type": "literal", + "value": "WARNING" + }, + { + "type": "literal", + "value": "ERROR" + }, + { + "type": "literal", + "value": "CRITICAL" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default `Dataset` ID. 
This option is utilized by the storage client." + } + ] + }, + "flags": {}, + "groups": [], + "id": 99, + "module": "configuration", + "name": "default_dataset_id", + "parsedDocstring": { + "text": "The default `Dataset` ID. This option is utilized by the storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default `KeyValueStore` ID. This option is utilized by the storage client." + } + ] + }, + "flags": {}, + "groups": [], + "id": 100, + "module": "configuration", + "name": "default_key_value_store_id", + "parsedDocstring": { + "text": "The default `KeyValueStore` ID. This option is utilized by the storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default `RequestQueue` ID. This option is utilized by the storage client." + } + ] + }, + "flags": {}, + "groups": [], + "id": 101, + "module": "configuration", + "name": "default_request_queue_id", + "parsedDocstring": { + "text": "The default `RequestQueue` ID. This option is utilized by the storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to purge the storage on the start. This option is utilized by the `MemoryStorageClient`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 102, + "module": "configuration", + "name": "purge_on_start", + "parsedDocstring": { + "text": "Whether to purge the storage on the start. This option is utilized by the `MemoryStorageClient`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to write the storage metadata. This option is utilized by the `MemoryStorageClient`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 103, + "module": "configuration", + "name": "write_metadata", + "parsedDocstring": { + "text": "Whether to write the storage metadata. This option is utilized by the `MemoryStorageClient`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist the storage. This option is utilized by the `MemoryStorageClient`." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 104, + "module": "configuration", + "name": "persist_storage", + "parsedDocstring": { + "text": "Whether to persist the storage. This option is utilized by the `MemoryStorageClient`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `PersistState` events are emitted. The event ensures the state persistence during\nthe crawler run. This option is utilized by the `EventManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 105, + "module": "configuration", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval at which `PersistState` events are emitted. The event ensures the state persistence during\nthe crawler run. This option is utilized by the `EventManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.\nThis option is utilized by the `LocalEventManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 106, + "module": "configuration", + "name": "system_info_interval", + "parsedDocstring": { + "text": "Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.\nThis option is utilized by the `LocalEventManager`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 107, + "module": "configuration", + "name": "max_used_cpu_ratio", + "parsedDocstring": { + "text": "The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.\nThis option is used by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 108, + "module": "configuration", + "name": "max_used_memory_ratio", + "parsedDocstring": { + "text": "The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.\nThis option is used by the `Snapshotter`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 109, + "module": "configuration", + "name": "max_event_loop_delay", + "parsedDocstring": { + "text": "The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.\nThis option is used by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 110, + "module": "configuration", + "name": "max_client_errors", + "parsedDocstring": { + "text": "The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.\nThis option is used by the `Snapshotter`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum used memory in megabytes. This option is utilized by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 111, + "module": "configuration", + "name": "memory_mbytes", + "parsedDocstring": { + "text": "The maximum used memory in megabytes. This option is utilized by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Annotated[ int | None, Field( validation_alias=AliasChoices( 'actor_memory_mbytes', 'apify_memory_mbytes', 'crawlee_memory_mbytes', ) ), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to\ncalculate the maximum memory. This option is utilized by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 112, + "module": "configuration", + "name": "available_memory_ratio", + "parsedDocstring": { + "text": "The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to\ncalculate the maximum memory. This option is utilized by the `Snapshotter`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The path to the storage directory. This option is utilized by the `MemoryStorageClient`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 113, + "module": "configuration", + "name": "storage_dir", + "parsedDocstring": { + "text": "The path to the storage directory. This option is utilized by the `MemoryStorageClient`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 114, + "module": "configuration", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 115, + "module": "configuration", + "name": "get_global_configuration", + "parsedDocstring": { + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 260 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + } + ] + }, + "flags": {}, + "id": 116, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_global_configuration", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration settings for the Crawlee project.\n\nThis class stores common configurable parameters for Crawlee. Default values are provided for all settings,\nso typically, no adjustments are necessary. 
However, you may modify settings for specific use cases,\nsuch as changing the default storage directory, the default storage IDs, the timeout for internal\noperations, and more.\n\nSettings can also be configured via environment variables, prefixed with `CRAWLEE_`." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 115 + ], + "title": "Methods" + }, + { + "children": [ + 112, + 96, + 99, + 100, + 101, + 97, + 114, + 95, + 98, + 110, + 109, + 107, + 108, + 111, + 94, + 105, + 104, + 102, + 113, + 106, + 103 + ], + "title": "Properties" + } + ], + "id": 93, + "module": "configuration", + "name": "Configuration", + "parsedDocstring": { + "text": "Configuration settings for the Crawlee project.\n\nThis class stores common configurable parameters for Crawlee. Default values are provided for all settings,\nso typically, no adjustments are necessary. However, you may modify settings for specific use cases,\nsuch as changing the default storage directory, the default storage IDs, the timeout for internal\noperations, and more.\n\nSettings can also be configured via environment variables, prefixed with `CRAWLEE_`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 117, + "module": "_types", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 118, + "module": "_types", + "name": "HttpMethod", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "TypeAlias", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 119, + "module": "_types", + "name": "HttpPayload", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "TypeAlias", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 120, + "module": "_types", + "name": "RequestTransformAction", + "parsedDocstring": { + "text": "" 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "TypeAlias", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 121, + "module": "_types", + "name": "EnqueueStrategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "TypeAlias", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 122, + "module": "_types", + "name": "SkippedReason", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "TypeAlias", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 124, + "module": "_types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 125, + "module": "_types", + "name": "root", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 126, + "module": "_types", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 127, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 128, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 129, + "module": "_types", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 130, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 131, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 132, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 133, + "module": "_types", + "name": "__delitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 134, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__delitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 135, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new instance of `HttpHeaders` combining this one with another one." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 136, + "module": "_types", + "name": "__or__", + "parsedDocstring": { + "text": "Return a new instance of `HttpHeaders` combining this one with another one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new instance of `HttpHeaders` combining this one with another one." + } + ] + }, + "flags": {}, + "id": 137, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__or__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 138, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Support reversed | operation (other | self)." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 139, + "module": "_types", + "name": "__ror__", + "parsedDocstring": { + "text": "Support reversed | operation (other | self)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Support reversed | operation (other | self)." 
+ } + ] + }, + "flags": {}, + "id": 140, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__ror__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 141, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 142, + "module": "_types", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 143, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 144, + "module": "_types", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 145, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": "int", + 
"type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A dictionary-like object representing HTTP headers." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 133, + 126, + 142, + 144, + 136, + 139, + 129 + ], + "title": "Methods" + }, + { + "children": [ + 124, + 125 + ], + "title": "Properties" + } + ], + "id": 123, + "module": "_types", + "name": "HttpHeaders", + "parsedDocstring": { + "text": "A dictionary-like object representing HTTP headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 147, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "min_concurrency": "The minimum number of tasks running in parallel. If you set this value too high\nwith respect to the available system memory and CPU, your code might run extremely slow or crash.", + "max_concurrency": "The maximum number of tasks running in parallel.", + "max_tasks_per_minute": "The maximum number of tasks per minute the pool can run. By default, this is set\nto infinity, but you can pass any positive, non-zero number.", + "desired_concurrency": "The desired number of tasks that should be running parallel on the start of the pool,\nif there is a large enough supply of them. By default, it is `min_concurrency`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 148, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The minimum number of tasks running in parallel. If you set this value too high\nwith respect to the available system memory and CPU, your code might run extremely slow or crash." + } + ] + }, + "defaultValue": "1", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 149, + "kind": 32768, + "kindString": "Parameter", + "name": "min_concurrency", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of tasks running in parallel." + } + ] + }, + "defaultValue": "200", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 150, + "kind": 32768, + "kindString": "Parameter", + "name": "max_concurrency", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of tasks per minute the pool can run. By default, this is set\nto infinity, but you can pass any positive, non-zero number." 
+ } + ] + }, + "defaultValue": "float('inf')", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 151, + "kind": 32768, + "kindString": "Parameter", + "name": "max_tasks_per_minute", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The desired number of tasks that should be running parallel on the start of the pool,\nif there is a large enough supply of them. By default, it is `min_concurrency`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 152, + "kind": 32768, + "kindString": "Parameter", + "name": "desired_concurrency", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Concurrency settings for AutoscaledPool." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 147 + ], + "title": "Methods" + } + ], + "id": 146, + "module": "_types", + "name": "ConcurrencySettings", + "parsedDocstring": { + "text": "Concurrency settings for AutoscaledPool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 8, + "kindString": "Enumeration", + "children": [ + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 154, + "module": "_types", + "name": "DATASET", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "type": "literal", + "value": "'Dataset'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 155, + "module": "_types", + "name": "KEY_VALUE_STORE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 146 + } + ], + "type": { + "type": "literal", + "value": "'Key-value store'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 156, + "module": "_types", + "name": "REQUEST_QUEUE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "type": "literal", + "value": "'Request queue'" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Possible Crawlee storage types." 
+ } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 154, + 155, + 156 + ], + "title": "Enumeration members" + } + ], + "id": 153, + "module": "_types", + "name": "StorageTypes", + "parsedDocstring": { + "text": "Possible Crawlee storage types." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 142 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": {}, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for the `enqueue_links` methods." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 159, + 162, + 161, + 158, + 160 + ], + "title": "Properties" + } + ], + "id": 157, + "module": "_types", + "name": "EnqueueLinksKwargs", + "parsedDocstring": { + "text": "Keyword arguments for the `enqueue_links` methods." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "AddRequestsKwargs", + "target": "163", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 164, + "module": "_types", + "name": "requests", + "parsedDocstring": { + "text": "Requests to be added to the `RequestManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 184 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3610, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.limit", + "target": 158, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3611, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[str]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.base_url", + "target": 159, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. 
The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3612, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired[EnqueueStrategy]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.strategy", + "target": 160, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3613, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[list[re.Pattern | Glob]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.include", + "target": 161, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3614, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[list[re.Pattern | Glob]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.exclude", + "target": 162, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for the `add_requests` methods." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3611, + 3614, + 3613, + 3610, + 164, + 3612 + ], + "title": "Properties" + } + ], + "id": 163, + "module": "_types", + "name": "AddRequestsKwargs", + "parsedDocstring": { + "text": "Keyword arguments for the `add_requests` methods." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "EnqueueLinksKwargs", + "target": "157", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `push_data` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 165, + "module": "_types", + "name": "PushDataKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `push_data` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PushDataFunctionCall", + "target": "166", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 167, + "module": "_types", + "name": "data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "JsonSerializable", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 168, + "module": "_types", + "name": "dataset_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 194 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 169, + "module": "_types", + "name": "dataset_name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 195 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 167, + 168, + 169 + ], + "title": "Properties" + } + ], + "id": 166, + "module": "_types", + "name": "PushDataFunctionCall", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "PushDataKwargs", + "target": "165", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 171, + "module": "_types", + "name": "get_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + 
"name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 172, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 173, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 174, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": {}, + "id": 180, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 181, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": {}, + "id": 182, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 183, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 184, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "117" + } 
+ } + ], + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + }, + { + "flags": {}, + "id": 185, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 186, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 187, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 175, + "module": "_types", + "name": "set_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 212 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 177, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 178, + "kind": 
32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 179, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 171, + 175 + ], + "title": "Methods" + } + ], + "id": 170, + "module": "_types", + "name": "KeyValueStoreInterface", + "parsedDocstring": { + "text": "The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 189, + "module": "_types", + "name": "content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 190, + "module": "_types", + "name": "content_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 223 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "()", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 189, + 190 + ], + "title": "Properties" + } + ], + "id": 188, + "module": "_types", + "name": "KeyValueStoreValue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + 
"kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 192, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 193, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 194, + "kind": 32768, + "kindString": "Parameter", + "name": "actual_key_value_store", + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 195, + "module": "_types", + "name": "set_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 196, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 197, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 198, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 199, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 200, + "module": "_types", + "name": "get_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 248 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 201, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 202, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 203, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": 
"literal", + "value": null + } + ] + } + }, + { + "flags": {}, + "id": 204, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 205, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": {}, + "id": 206, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 207, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 208, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + }, + { + "flags": {}, + "id": 209, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 210, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 211, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" 
+ }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 192, + 200, + 195 + ], + "title": "Methods" + } + ], + "id": 191, + "module": "_types", + "name": "KeyValueStoreChangeRecords", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 226 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 213, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 214, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 215, + "kind": 32768, + "kindString": "Parameter", + "name": "key_value_store_getter", + "type": { + "name": "GetKeyValueStoreFunction", + "type": "reference", + "target": "273" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `add_requests` context helper." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 216, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Track a call to the `add_requests` context helper." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `add_requests` context helper." + } + ] + }, + "flags": {}, + "id": 217, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 218, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `push_data` context helper." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 220, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Track a call to the `push_data` context helper." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 272 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `push_data` context helper." + } + ] + }, + "flags": {}, + "id": 221, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 222, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 223, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 224, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 226, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 293 + } + ], + 
"type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 227, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 228, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 229, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreInterface", + "type": "reference", + "target": "170" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Record of calls to storage-related context helpers." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 213, + 216, + 226, + 220 + ], + "title": "Methods" + } + ], + "id": 212, + "module": "_types", + "name": "RequestHandlerRunResult", + "parsedDocstring": { + "text": "Record of calls to storage-related context helpers." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 255 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 231, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "requests": "Requests to be added to the `RequestManager`.", + "**kwargs": "Additional keyword arguments." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 315 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 232, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 233, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. 
This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function for adding requests to the `RequestManager`, with optional filtering.\n\nIt simplifies the process of adding requests to the `RequestManager`. It automatically opens\nthe specified one and adds the provided requests." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 231 + ], + "title": "Methods" + } + ], + "id": 230, + "module": "_types", + "name": "AddRequestsFunction", + "parsedDocstring": { + "text": "Function for adding requests to the `RequestManager`, with optional filtering.\n\nIt simplifies the process of adding requests to the `RequestManager`. It automatically opens\nthe specified one and adds the provided requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 308 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 236, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call enqueue links function.\n", + "args": { + "selector": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors.", + "label": "Label for the newly created `Request` objects, used for request routing.", + "user_data": "User data to be provided to the newly created `Request` objects.", + "transform_request_function": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification.", + "requests": "Requests to be added to the `RequestManager`.", + "**kwargs": "Additional keyword arguments." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 359 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 237, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 238, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 239, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 240, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 241, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "384" + }, + { + "type": "reference", + "name": "Literal['skip', 'unchanged']" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 242, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 244, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { 
+ "summary": [ + { + "kind": "text", + "text": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 245, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 246, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 247, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 248, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "384" + }, + { + "type": "reference", + "name": "Literal['skip', 'unchanged']" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 250, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { 
+ "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 251, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.\n\nIt adds explicitly passed `requests` to the `RequestManager` or it extracts 
URLs from the current page and enqueues\nthem for further crawling. It allows filtering through selectors and other options. You can also specify labels and\nuser data to be associated with the newly created `Request` objects.\n\nIt should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together\nwith `requests` argument.\n\nFor even more control over the enqueued links you can use combination of `ExtractLinksFunction` and\n`AddRequestsFunction`." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 236 + ], + "title": "Methods" + } + ], + "id": 235, + "module": "_types", + "name": "EnqueueLinksFunction", + "parsedDocstring": { + "text": "A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.\n\nIt adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues\nthem for further crawling. It allows filtering through selectors and other options. You can also specify labels and\nuser data to be associated with the newly created `Request` objects.\n\nIt should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together\nwith `requests` argument.\n\nFor even more control over the enqueued links you can use combination of `ExtractLinksFunction` and\n`AddRequestsFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 329 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call extract links function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 254, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call extract links function.\n", + "args": { + "selector": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors.", + "label": "Label for the newly created `Request` objects, used for request routing.", + "user_data": "User data to be provided to the newly created `Request` objects.", + "transform_request_function": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification.", + "**kwargs": "Additional keyword arguments." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 396 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call extract links function.\n" + } + ] + }, + "flags": {}, + "id": 255, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "'a'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 256, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 257, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 258, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 259, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "384" + }, + { + "type": "reference", + "name": "Literal['skip', 'unchanged']" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 158, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 159, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 160, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 161, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 162, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "2982" + } + ] + } + ], + "target": "866" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ], + "target": "866" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for extracting URLs to crawl based on elements selected by a 
given selector.\n\nIt extracts URLs from the current page and allows filtering through selectors and other options. You can also\nspecify labels and user data to be associated with the newly created `Request` objects." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 254 + ], + "title": "Methods" + } + ], + "id": 253, + "module": "_types", + "name": "ExtractLinksFunction", + "parsedDocstring": { + "text": "A function for extracting URLs to crawl based on elements selected by a given selector.\n\nIt extracts URLs from the current page and allows filtering through selectors and other options. You can also\nspecify labels and user data to be associated with the newly created `Request` objects." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 389 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 262, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "dataset_id": "The ID of the `Dataset` to export data from.", + "dataset_name": "The name of the `Dataset` to export data from.", + "**kwargs": "Additional keyword arguments." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 431 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 263, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset` to export data from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 264, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` to export data from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 265, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 638, + "module": "storages._dataset", + "name": "key", + "parsedDocstring": { + "text": "The key under which to save the data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data. Either 'json' or 'csv'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 639, + "module": "storages._dataset", + "name": "content_type", + "parsedDocstring": { + "text": "The format in which to export the data. Either 'json' or 'csv'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "json" + }, + { + "type": "literal", + "value": "csv" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 640, + "module": "storages._dataset", + "name": "to_key_value_store_id", + "parsedDocstring": { + "text": "ID of the key-value store to save the exported file." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 641, + "module": "storages._dataset", + "name": "to_key_value_store_name", + "parsedDocstring": { + "text": "Name of the key-value store to save the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for exporting data from a `Dataset`.\n\nIt simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports\nits content to a `KeyValueStore`." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 262 + ], + "title": "Methods" + } + ], + "id": 261, + "module": "_types", + "name": "ExportToFunction", + "parsedDocstring": { + "text": "A function for exporting data from a `Dataset`.\n\nIt simplifies the process of exporting data from a `Dataset`. 
It opens the specified one and exports\nits content to a `KeyValueStore`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 424 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 268, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "dataset_id": "ID of the `Dataset` to get data from.", + "dataset_name": "Name of the `Dataset` to get data from.", + "**kwargs": "Additional keyword arguments.\n" + }, + "returns": "A page of retrieved items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 454 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A page of retrieved items." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 269, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `Dataset` to get data from." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 270, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `Dataset` to get data from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 271, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skip the specified number of items at the start." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 626, + "module": "storages._dataset", + "name": "offset", + "parsedDocstring": { + "text": "Skip the specified number of items at the start." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 627, + "module": "storages._dataset", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. 
Unlimited if None." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 628, + "module": "storages._dataset", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 629, + "module": "storages._dataset", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 630, + "module": "storages._dataset", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 631, + "module": "storages._dataset", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwind items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 632, + "module": "storages._dataset", + "name": "unwind", + "parsedDocstring": { + "text": "Unwind items by a specified array field, turning each element into a separate item." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude empty items from the results if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 633, + "module": "storages._dataset", + "name": "skip_empty", + "parsedDocstring": { + "text": "Exclude empty items from the results if True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude fields starting with '#' if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 634, + "module": "storages._dataset", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Exclude fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Field to be flattened in returned items." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 635, + "module": "storages._dataset", + "name": "flatten", + "parsedDocstring": { + "text": "Field to be flattened in returned items." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specify the dataset view to be used." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 636, + "module": "storages._dataset", + "name": "view", + "parsedDocstring": { + "text": "Specify the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "DatasetItemsListPage", + "target": "823" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for retrieving data from a `Dataset`.\n\nIt simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves\ndata based on the provided parameters. It allows filtering and pagination." 
+ } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 268 + ], + "title": "Methods" + } + ], + "id": 267, + "module": "_types", + "name": "GetDataFunction", + "parsedDocstring": { + "text": "A function for retrieving data from a `Dataset`.\n\nIt simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves\ndata based on the provided parameters. It allows filtering and pagination." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 447 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 274, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "id": "The ID of the `KeyValueStore` to get.", + "name": "The name of the `KeyValueStore` to get." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 479 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `KeyValueStore` to get." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 276, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` to get." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 277, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "569" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 274 + ], + "title": "Methods" + } + ], + "id": 273, + "module": "_types", + "name": "GetKeyValueStoreFunction", + "parsedDocstring": { + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 473 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 279, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "id": "The ID of the `KeyValueStore` to get.", + "name": "The name of the `KeyValueStore` to get." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 499 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 280, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `KeyValueStore` to get." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 281, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` to get." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 282, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStoreInterface", + "target": "170" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 279 + ], + "title": "Methods" + } + ], + "id": 278, + "module": "_types", + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "parsedDocstring": { + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 493 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 284, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "data": "The data to push to the `Dataset`.", + "dataset_id": "The ID of the `Dataset` to push the data to.", + "dataset_name": "The name of the `Dataset` to push the data to.", + "**kwargs": "Additional keyword arguments." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 521 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to push to the `Dataset`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 286, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset` to push the data to." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 287, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` to push the data to." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 288, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for pushing data to a `Dataset`.\n\nIt simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes\nthe provided data to it." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 284 + ], + "title": "Methods" + } + ], + "id": 283, + "module": "_types", + "name": "PushDataFunction", + "parsedDocstring": { + "text": "A function for pushing data to a `Dataset`.\n\nIt simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes\nthe provided data to it." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 514 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call send request function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 291, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call send request function.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The payload to include in the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 546 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Call send request function.\n" + } + ] + }, + "flags": {}, + "id": 292, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 293, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 294, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The payload to include in the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 295, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 296, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "HttpResponse", + "target": "1909" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for sending HTTP requests.\n\nIt simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used\nwithin request handlers to send additional HTTP requests to target URLs." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 291 + ], + "title": "Methods" + } + ], + "id": 290, + "module": "_types", + "name": "SendRequestFunction", + "parsedDocstring": { + "text": "A function for sending HTTP requests.\n\nIt simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used\nwithin request handlers to send additional HTTP requests to target URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 539 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Screenshot of the page format." + } + ] + }, + "flags": {}, + "groups": [], + "id": 298, + "module": "_types", + "name": "screenshot", + "parsedDocstring": { + "text": "Screenshot of the page format." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 572 + } + ], + "type": { + "name": "bytes | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTML content of the page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 299, + "module": "_types", + "name": "html", + "parsedDocstring": { + "text": "HTML content of the page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 575 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 300, + "module": "_types", + "name": "__bool__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 578 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 301, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of a crawled page." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + }, + { + "args": ".dataclass", + "name": "dataclasses" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 300 + ], + "title": "Methods" + }, + { + "children": [ + 299, + 298 + ], + "title": "Properties" + } + ], + "id": 297, + "module": "_types", + "name": "PageSnapshot", + "parsedDocstring": { + "text": "Snapshot of a crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get page snapshot.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 303, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Get page snapshot.\n", + "returns": "Snapshot of a page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 586 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Snapshot of a page." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get page snapshot.\n" + } + ] + }, + "flags": {}, + "id": 304, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "PageSnapshot", + "target": "297" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for getting snapshot of a page." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 303 + ], + "title": "Methods" + } + ], + "id": 302, + "module": "_types", + "name": "GetPageSnapshot", + "parsedDocstring": { + "text": "A function for getting snapshot of a page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 583 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 306, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "default_value": "The default value to initialize the state if it is not already set.\n" + }, + "returns": "The current state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 604 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The current state." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 307, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default value to initialize the state if it is not already set.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 308, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for managing state within the crawling context.\n\nIt allows the use of persistent state across multiple crawls.\n\n\n:::warning Warning\nThis is an experimental feature. 
The behavior and interface may change in future versions.\n:::" + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 306 + ], + "title": "Methods" + } + ], + "id": 305, + "module": "_types", + "name": "UseStateFunction", + "parsedDocstring": { + "text": "A function for managing state within the crawling context.\n\nIt allows the use of persistent state across multiple crawls.\n\n\n:::warning Warning\nThis is an experimental feature. The behavior and interface may change in future versions.\n:::" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 595 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 310, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 311, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 312, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 313, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 314, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 315, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 316, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 317, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 318, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 319, + "module": "_types", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 654 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 320, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 321, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Basic crawling context.\n\nIt represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more\nspecific crawlers to provide additional functionality." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 321, + 319 + ], + "title": "Methods" + }, + { + "children": [ + 314, + 317, + 318, + 312, + 315, + 310, + 313, + 311, + 316 + ], + "title": "Properties" + } + ], + "id": 309, + "module": "_types", + "name": "BasicCrawlingContext", + "parsedDocstring": { + "text": "Basic crawling context.\n\nIt represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more\nspecific crawlers to provide additional functionality." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 620 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightPreNavCrawlingContext", + "target": "2179", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightPreNavCrawlingContext", + "target": "2559", + "type": "reference" + }, + { + "name": "HttpCrawlingContext", + "target": "2625", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 324, + "module": "_service_locator", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 325, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + 
"value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the configuration." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 326, + "module": "_service_locator", + "name": "get_configuration", + "parsedDocstring": { + "text": "Get the configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the configuration." + } + ] + }, + "flags": {}, + "id": 327, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_configuration", + "parameters": [], + "type": { + "name": "Configuration", + "type": "reference", + "target": "93" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 328, + "module": "_service_locator", + "name": "set_configuration", + "parsedDocstring": { + "text": "Set the configuration.\n", + "args": { + "configuration": "The configuration to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the configuration.\n" + } + ] + }, + "flags": {}, + "id": 329, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_configuration", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration to set.\n" 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 330, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "93" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the event manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 331, + "module": "_service_locator", + "name": "get_event_manager", + "parsedDocstring": { + "text": "Get the event manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the event manager." 
+ } + ] + }, + "flags": {}, + "id": 332, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_event_manager", + "parameters": [], + "type": { + "name": "EventManager", + "type": "reference", + "target": "2086" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the event manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 333, + "module": "_service_locator", + "name": "set_event_manager", + "parsedDocstring": { + "text": "Set the event manager.\n", + "args": { + "event_manager": "The event manager to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the event manager.\n" + } + ] + }, + "flags": {}, + "id": 334, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_event_manager", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager to set.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 335, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager", + "type": "reference", + "target": "2086" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 336, + "module": "_service_locator", + "name": "get_storage_client", + "parsedDocstring": { + "text": "Get the storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage client." + } + ] + }, + "flags": {}, + "id": 337, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client", + "parameters": [], + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the storage client.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 338, + "module": "_service_locator", + "name": "set_storage_client", + "parsedDocstring": { + "text": "Set the storage client.\n", + "args": { + "storage_client": "The storage client to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the storage client.\n" + } + ] + }, + "flags": {}, + "id": 339, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_storage_client", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client to set.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 340, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "Service locator for managing the services used by Crawlee.\n\nAll services are initialized to its default value lazily." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 324, + 326, + 331, + 336, + 328, + 333, + 338 + ], + "title": "Methods" + } + ], + "id": 323, + "module": "_service_locator", + "name": "ServiceLocator", + "parsedDocstring": { + "text": "Service locator for managing the services used by Crawlee.\n\nAll services are initialized to its default value lazily." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 341, + "module": "_service_locator", + "name": "service_locator", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 343, + "module": "_request", + "name": "UNPROCESSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 344, + "module": "_request", + "name": "BEFORE_NAV", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 345, + "module": "_request", + "name": "AFTER_NAV", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 346, + "module": "_request", + "name": "REQUEST_HANDLER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 347, + "module": "_request", + "name": "DONE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", 
+ "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 348, + "module": "_request", + "name": "ERROR_HANDLER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 349, + "module": "_request", + "name": "ERROR", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 350, + "module": "_request", + "name": "SKIPPED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 345, + 344, + 347, + 349, + 348, + 346, + 350, + 343 + ], + "title": "Properties" + } + ], + "id": 342, + "module": "_request", + "name": "RequestState", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries` option of\n`BasicCrawler`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 352, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries` option of\n`BasicCrawler`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='maxRetries')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 353, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal['all', 'same-domain', 'same-hostname', 'same-origin']" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Describes the request's current lifecycle state." + } + ] + }, + "flags": {}, + "groups": [], + "id": 354, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Describes the request's current lifecycle state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "RequestState | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestState", + "target": "342" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of finished session rotations for this request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 355, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "The number of finished session rotations for this request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='sessionRotationCount')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 356, + "module": "_request", + "name": "skip_navigation", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 357, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='lastProxyTier')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the request should be enqueued at the front of the queue." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 358, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." + } + ] + }, + "flags": {}, + "groups": [], + "id": 359, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a session to which the request is bound." + } + ] + }, + "flags": {}, + "groups": [], + "id": 360, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "ID of a session to which the request is bound." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Annotated[str | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 359, + 353, + 358, + 357, + 352, + 360, + 355, + 356, + 354 + ], + "title": "Properties" + } + ], + "id": 351, + "module": "_request", + "name": "CrawleeRequestData", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 362, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 363, + "module": "_request", + "name": "__pydantic_extra__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 364, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Annotated[CrawleeRequestData | None, Field(alias='__crawlee')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "CrawleeRequestData", + "target": "351" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label used for request routing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 365, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "Label used for request routing." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Annotated[str | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 366, + "module": "_request", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 367, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ + { + 
"flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 368, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 369, + "module": "_request", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 370, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 371, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 372, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 373, + "module": "_request", + "name": "__delitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + 
"signatures": [ + { + "flags": {}, + "id": 374, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__delitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 375, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 376, + "module": "_request", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 377, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 378, + "module": "_request", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 379, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + 
} + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 380, + "module": "_request", + "name": "__eq__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 381, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 382, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents the `user_data` part of a Request.\n\nApart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible\nvalues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 373, + 380, + 366, + 376, + 378, + 369 + ], + "title": "Methods" + }, + { + "children": [ + 363, + 364, + 365, + 362 + ], + "title": "Properties" + } + ], + "id": 361, + "module": "_request", + "name": "UserData", + "parsedDocstring": { + "text": "Represents the `user_data` part of a Request.\n\nApart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible\nvalues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 383, + "module": "_request", + "name": "user_data_adapter", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 385, + "module": "_request", + "name": "url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 386, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 
'PATCH']" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 387, + "module": "_request", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 388, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 389, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "" + }, + "sources": [ + 
{ + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 390, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 391, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 392, + "module": "_request", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 393, + "module": "_request", + "name": "keep_url_fragment", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 394, + "module": "_request", + "name": "use_extended_unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 395, + "module": "_request", + "name": "always_enqueue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + 
"name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 396, + "module": "_request", + "name": "user_data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 397, + "module": "_request", + "name": "no_retry", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Options that can be used to customize request creation.\n\nThis type exactly matches the parameters of `Request.from_url` method." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 395, + 387, + 392, + 393, + 389, + 386, + 397, + 388, + 390, + 391, + 385, + 394, + 396 + ], + "title": "Properties" + } + ], + "id": 384, + "module": "_request", + "name": "RequestOptions", + "parsedDocstring": { + "text": "Options that can be used to customize request creation.\n\nThis type exactly matches the parameters of `Request.from_url` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 399, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + } + ] + }, + "flags": {}, + "groups": [], + "id": 400, + "module": "_request", + "name": "url", + "parsedDocstring": { + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 401, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "HTTP request method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request headers." + } + ] + }, + "flags": {}, + "groups": [], + "id": 402, + "module": "_request", + "name": "headers", + "parsedDocstring": { + "text": "HTTP request headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request payload." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 403, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "HTTP request payload." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Annotated[ HttpPayload | None, BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v), PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom user data assigned to the request. Use this to save any request related data to the\nrequest's scope, keeping them accessible on retries, failures etc." + } + ] + }, + "flags": {}, + "groups": [], + "id": 404, + "module": "_request", + "name": "user_data", + "parsedDocstring": { + "text": "Custom user data assigned to the request. Use this to save any request related data to the\nrequest's scope, keeping them accessible on retries, failures etc." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the request has been retried." + } + ] + }, + "flags": {}, + "groups": [], + "id": 405, + "module": "_request", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the request has been retried." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will not be retried in case of failure." + } + ] + }, + "flags": {}, + "groups": [], + "id": 406, + "module": "_request", + "name": "no_retry", + "parsedDocstring": { + "text": "If set to `True`, the request will not be retried in case of failure." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + } + ] + }, + "flags": {}, + "groups": [], + "id": 407, + "module": "_request", + "name": "loaded_url", + "parsedDocstring": { + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the request was handled." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 408, + "module": "_request", + "name": "handled_at", + "parsedDocstring": { + "text": "Timestamp when the request was handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='handledAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property\nto override the default behavior and specify which URLs shall be considered equal." + } + ] + }, + "flags": {}, + "groups": [], + "id": 409, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property\nto override the default behavior and specify which URLs shall be considered equal." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique identifier for the request. Note that this is not used for deduplication, and should not be confused\nwith `unique_key`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 410, + "module": "_request", + "name": "id", + "parsedDocstring": { + "text": "A unique identifier for the request. Note that this is not used for deduplication, and should not be confused\nwith `unique_key`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 411, + "module": "_request", + "name": "from_url", + "parsedDocstring": { + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. 
It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n", + "args": { + "url": "The URL of the request.", + "method": "The HTTP method of the request.", + "headers": "The HTTP headers of the request.", + "payload": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.", + "label": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers).", + "session_id": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised.", + "unique_key": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical.", + "id": "A unique identifier for the request. If not provided, it is automatically generated from the\n`unique_key`.", + "keep_url_fragment": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided.", + "use_extended_unique_key": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided.", + "always_enqueue": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.", + "**kwargs": "Additional request properties." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "flags": {}, + "id": 412, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 413, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method of the request." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 414, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers of the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 415, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 416, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | str | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 417, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 418, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 419, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique identifier for the request. If not provided, it is automatically generated from the\n`unique_key`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 420, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 421, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 422, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 423, + "kind": 32768, + "kindString": "Parameter", + "name": "always_enqueue", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 424, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 425, + "module": "_request", + "name": "get_query_param_from_url", + "parsedDocstring": { + "text": "Get the value of a specific query parameter from the URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 314 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." 
+ } + ] + }, + "flags": {}, + "id": 426, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_query_param_from_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 427, + "kind": 32768, + "kindString": "Parameter", + "name": "param", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 428, + "kind": 32768, + "kindString": "Parameter", + "name": "default", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A string used to differentiate between arbitrary request types." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 429, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "A string used to differentiate between arbitrary request types." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 320 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the bound session, if there is any." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 430, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "The ID of the bound session, if there is any." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 431, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 330 + } + ], + "type": { + "name": "CrawleeRequestData", + "type": "reference", + "target": "351" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 432, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 339 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "crawl_depth" + } + ], + "flags": {}, + "groups": [], + "id": 433, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 344 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawl_depth", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 435, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 436, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 348 + } + ], + "type": { + "name": "RequestState | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestState", + "target": "342" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "state" + } + ], + "flags": {}, + "groups": [], + "id": 437, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 353 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 438, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 439, + "kind": 32768, + "kindString": "Parameter", + "name": "new_state", + "type": { + "name": "RequestState", + "type": "reference", + "target": "342" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific limit on the number of retries of the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 440, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Crawlee-specific limit on the number of retries of the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 357 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "max_retries" + } + ], + "flags": {}, + "groups": [], + "id": 441, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 362 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 442, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "max_retries", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 443, + "kind": 32768, + "kindString": "Parameter", + "name": "new_max_retries", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific number of finished session rotations for the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 444, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "Crawlee-specific number of finished session rotations for the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 366 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "session_rotation_count" + } + ], + "flags": {}, + "groups": [], + "id": 445, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 371 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 446, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "session_rotation_count", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 447, + "kind": 32768, + "kindString": "Parameter", + "name": "new_session_rotation_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 448, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 375 + } + ], + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "all" + }, + { + "type": "literal", + "value": "same-domain" + }, + { + "type": "literal", + "value": "same-hostname" + }, + { + "type": "literal", + "value": "same-origin" + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "enqueue_strategy" + } + ], + "flags": {}, + "groups": [], + "id": 449, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 380 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "enqueue_strategy", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 451, + "kind": 32768, + "kindString": "Parameter", + "name": "new_enqueue_strategy", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "all" + }, + { + "type": "literal", + "value": "same-domain" + }, + { + "type": "literal", + "value": "same-hostname" + }, + { + "type": "literal", + "value": "same-origin" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 452, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 384 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "last_proxy_tier" + } + ], + "flags": {}, + "groups": [], + "id": 453, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 389 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 454, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "last_proxy_tier", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 455, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the request should be enqueued at the front of the queue." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 456, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 393 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "forefront" + } + ], + "flags": {}, + "groups": [], + "id": 457, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 398 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 458, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "forefront", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 459, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a request in the Crawlee framework, containing the necessary information for crawling operations.\n\nThe `Request` class is one of the core components in Crawlee, utilized by various components such as request\nproviders, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,\nincluding the URL, HTTP method, headers, payload, and user data. 
The user data allows custom information\nto be stored and persisted throughout the request lifecycle, including its retries.\n\nKey functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used\nfor request deduplication, controlling retries, handling state management, and enabling configuration for session\nrotation and proxy handling.\n\nThe recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically\ngenerates a unique key and identifier based on the URL and request parameters.\n\n### Usage\n\n```python\nfrom crawlee import Request\n\nrequest = Request.from_url('https://crawlee.dev')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 433, + 449, + 457, + 411, + 425, + 453, + 441, + 445, + 437 + ], + "title": "Methods" + }, + { + "children": [ + 432, + 431, + 448, + 456, + 408, + 402, + 410, + 429, + 452, + 407, + 440, + 401, + 399, + 406, + 403, + 405, + 430, + 444, + 436, + 409, + 400, + 404 + ], + "title": "Properties" + } + ], + "id": 398, + "module": "_request", + "name": "Request", + "parsedDocstring": { + "text": "Represents a request in the Crawlee framework, containing the necessary information for crawling operations.\n\nThe `Request` class is one of the core components in Crawlee, utilized by various components such as request\nproviders, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,\nincluding the URL, HTTP method, headers, payload, and user data. 
The user data allows custom information\nto be stored and persisted throughout the request lifecycle, including its retries.\n\nKey functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used\nfor request deduplication, controlling retries, handling state management, and enabling configuration for session\nrotation and proxy handling.\n\nThe recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically\ngenerates a unique key and identifier based on the URL and request parameters.\n\n### Usage\n\n```python\nfrom crawlee import Request\n\nrequest = Request.from_url('https://crawlee.dev')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RequestWithLock", + "target": "460", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the lock expires." + } + ] + }, + "flags": {}, + "groups": [], + "id": 461, + "module": "_request", + "name": "lock_expires_at", + "parsedDocstring": { + "text": "The timestamp when the lock expires." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 405 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3586, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.model_config", + "target": 399, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3587, + "module": "_request", + "name": "url", + "parsedDocstring": { + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "Annotated[str, BeforeValidator(validate_http_url), Field()]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.url", + "target": 400, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request method." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3588, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "HTTP request method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "118" + }, + "inheritedFrom": { + "name": "Request.method", + "target": 401, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request headers." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3589, + "module": "_request", + "name": "headers", + "parsedDocstring": { + "text": "HTTP request headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.headers", + "target": 402, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request payload." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3590, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "HTTP request payload." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Annotated[ HttpPayload | None, BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v), PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v), ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.payload", + "target": 403, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom user data assigned to the request. Use this to save any request related data to the\nrequest's scope, keeping them accessible on retries, failures etc." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3591, + "module": "_request", + "name": "user_data", + "parsedDocstring": { + "text": "Custom user data assigned to the request. Use this to save any request related data to the\nrequest's scope, keeping them accessible on retries, failures etc." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Annotated[ dict[str, JsonSerializable], Field(alias='userData', default_factory=lambda: UserData()), PlainValidator(user_data_adapter.validate_python), PlainSerializer( lambda instance: user_data_adapter.dump_python( instance, by_alias=True, exclude_none=True, exclude_unset=True, exclude_defaults=True, ) ), ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.user_data", + "target": 404, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the request has been retried." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3592, + "module": "_request", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the request has been retried." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Annotated[int, Field(alias='retryCount')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.retry_count", + "target": 405, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will not be retried in case of failure." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3593, + "module": "_request", + "name": "no_retry", + "parsedDocstring": { + "text": "If set to `True`, the request will not be retried in case of failure." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Annotated[bool, Field(alias='noRetry')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.no_retry", + "target": 406, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3594, + "module": "_request", + "name": "loaded_url", + "parsedDocstring": { + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.loaded_url", + "target": 407, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the request was handled." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3595, + "module": "_request", + "name": "handled_at", + "parsedDocstring": { + "text": "Timestamp when the request was handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='handledAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.handled_at", + "target": 408, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property\nto override the default behavior and specify which URLs shall be considered equal." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3596, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "A unique key identifying the request. 
Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property\nto override the default behavior and specify which URLs shall be considered equal." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "Annotated[str, Field(alias='uniqueKey')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.unique_key", + "target": 409, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique identifier for the request. Note that this is not used for deduplication, and should not be confused\nwith `unique_key`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3597, + "module": "_request", + "name": "id", + "parsedDocstring": { + "text": "A unique identifier for the request. Note that this is not used for deduplication, and should not be confused\nwith `unique_key`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.id", + "target": 410, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. 
It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3598, + "module": "_request", + "name": "from_url", + "parsedDocstring": { + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n", + "args": { + "url": "The URL of the request.", + "method": "The HTTP method of the request.", + "headers": "The HTTP headers of the request.", + "payload": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.", + "label": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers).", + "session_id": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised.", + "unique_key": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical.", + "id": "A unique identifier for the request. 
If not provided, it is automatically generated from the\n`unique_key`.", + "keep_url_fragment": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided.", + "use_extended_unique_key": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided.", + "always_enqueue": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.", + "**kwargs": "Additional request properties." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "flags": {}, + "id": 412, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the request." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 413, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method of the request." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 414, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "118" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers of the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 415, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 416, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 417, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 418, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 419, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique identifier for the request. If not provided, it is automatically generated from the\n`unique_key`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 420, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 421, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 422, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 423, + "kind": 32768, + "kindString": "Parameter", + "name": "always_enqueue", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 424, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.from_url", + "target": 411, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Request.from_url", + "target": 411, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3599, + "module": "_request", + "name": "get_query_param_from_url", + "parsedDocstring": { + "text": "Get the value of a specific query parameter from the URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 314 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "flags": {}, + "id": 426, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_query_param_from_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 427, + "kind": 32768, + "kindString": "Parameter", + "name": "param", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 428, + "kind": 32768, + "kindString": "Parameter", + "name": "default", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.get_query_param_from_url", + "target": 425, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Request.get_query_param_from_url", + "target": 425, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A string used to differentiate between arbitrary request types." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3600, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "A string used to differentiate between arbitrary request types." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 320 + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.label", + "target": 429, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the bound session, if there is any." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3601, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "The ID of the bound session, if there is any." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.session_id", + "target": 430, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3602, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 330 + } + ], + "type": { + "name": "CrawleeRequestData", + "type": "reference", + "target": "351" + }, + "inheritedFrom": { + "name": "Request.crawlee_data", + "target": 431, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3603, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 339 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.crawl_depth", + "target": 432, + "type": "reference" + }, + "overwrites": { + "name": "Request.crawl_depth", + "target": 433, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3604, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 348 + } + ], + "type": { + "name": "RequestState | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.state", + "target": 436, + "type": "reference" + }, + "overwrites": { + "name": "Request.state", + "target": 437, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific limit on the number of retries of the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3605, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Crawlee-specific limit on the number of retries of the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 357 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.max_retries", + "target": 440, + "type": "reference" + }, + "overwrites": { + "name": "Request.max_retries", + "target": 441, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific number of finished session rotations for the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3606, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "Crawlee-specific number of finished session rotations for the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 366 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.session_rotation_count", + "target": 444, + "type": "reference" + }, + "overwrites": { + "name": "Request.session_rotation_count", + "target": 445, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3607, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 375 + } + ], + "type": { + "name": "EnqueueStrategy", + "type": "reference", + "target": "121" + }, + "inheritedFrom": { + "name": "Request.enqueue_strategy", + "target": 448, + "type": "reference" + }, + "overwrites": { + "name": "Request.enqueue_strategy", + "target": 449, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3608, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 384 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.last_proxy_tier", + "target": 452, + "type": "reference" + }, + "overwrites": { + "name": "Request.last_proxy_tier", + "target": 453, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the request should be enqueued at the front of the queue." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3609, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 393 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.forefront", + "target": 456, + "type": "reference" + }, + "overwrites": { + "name": "Request.forefront", + "target": 457, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A crawling request with information about locks." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3598, + 3599 + ], + "title": "Methods" + }, + { + "children": [ + 3603, + 3602, + 3607, + 3609, + 3595, + 3589, + 3597, + 3600, + 3608, + 3594, + 461, + 3605, + 3588, + 3586, + 3593, + 3590, + 3592, + 3601, + 3606, + 3604, + 3596, + 3587, + 3591 + ], + "title": "Properties" + } + ], + "id": 460, + "module": "_request", + "name": "RequestWithLock", + "parsedDocstring": { + "text": "A crawling request with information about locks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 402 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Request", + "target": "398", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 462, + "module": "_log_config", + "name": "get_configured_log_level", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 463, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_configured_log_level", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 464, + "module": "_log_config", + "name": "configure_logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 465, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "configure_logger", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 466, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "logging.Logger", + "type": 
"reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 467, + "kind": 32768, + "kindString": "Parameter", + "name": "remove_old_handlers", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 469, + "module": "_log_config", + "name": "empty_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 470, + "module": "_log_config", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "include_logger_name": "Include logger name at the beginning of the log line.", + "args": "Arguments passed to the parent class.", + "kwargs": "Keyword arguments passed to the parent class." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 471, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Include logger name at the beginning of the log line." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 472, + "kind": 32768, + "kindString": "Parameter", + "name": "include_logger_name", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments passed to the parent class." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 473, + "kind": 32768, + "kindString": "Parameter", + "name": "args", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments passed to the parent class." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 474, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 475, + "module": "_log_config", + "name": "format", + "parsedDocstring": { + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + } + ] + }, + "flags": {}, + "id": 476, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "format", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 477, + "kind": 32768, + "kindString": "Parameter", + "name": "record", + "type": { + "name": "logging.LogRecord", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.\n\nIt formats the log records so that they:\n- start with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then have the actual log message, if it's multiline then it's nicely indented\n- then have the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 470, + 475 + ], + "title": "Methods" + }, + { + "children": [ + 469 + ], + "title": "Properties" + } + ], + "id": 468, + "module": "_log_config", + "name": "CrawleeLogFormatter", + "parsedDocstring": { + "text": "Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.\n\nIt formats the log records so that they:\n- start with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then have the actual log message, if it's multiline then it's nicely indented\n- then have the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 478, + "module": "_consts", + "name": "METADATA_FILENAME", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 3 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 479, + "module": "_cli", + "name": "cli", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 480, + "module": "_cli", + "name": "template_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 481, + "module": "_cli", + "name": "crawler_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 482, + "module": "_cli", + "name": "http_client_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 483, + "module": "_cli", + "name": "package_manager_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 484, + "module": "_cli", + "name": "default_start_url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee is a web scraping and browser automation library." + } + ] + }, + "decorations": [ + { + "args": ".callback(invoke_without_command=True)", + "name": "cli" + } + ], + "flags": {}, + "groups": [], + "id": 485, + "module": "_cli", + "name": "callback", + "parsedDocstring": { + "text": "Crawlee is a web scraping and browser automation library." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee is a web scraping and browser automation library." + } + ] + }, + "flags": {}, + "id": 486, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "callback", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 487, + "kind": 32768, + "kindString": "Parameter", + "name": "version", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Bootstrap a new Crawlee project." 
+ } + ] + }, + "decorations": [ + { + "args": ".command()", + "name": "cli" + } + ], + "flags": {}, + "groups": [], + "id": 488, + "module": "_cli", + "name": "create", + "parsedDocstring": { + "text": "Bootstrap a new Crawlee project." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Bootstrap a new Crawlee project." + } + ] + }, + "flags": {}, + "id": 489, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create", + "parameters": [ + { + "defaultValue": "typer.Argument(\n default=None,\n show_default=False,\n help='The name of the project and the directory that will be created to contain it. '\n 'If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 490, + "kind": 32768, + "kindString": "Parameter", + "name": "project_name", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n '--crawler-type',\n '--template',\n show_default=False,\n help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 491, + "kind": 32768, + "kindString": "Parameter", + "name": "crawler_type", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n show_default=False,\n help='The library that will be used to make HTTP requests in your crawler. 
'\n 'If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 492, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "typer.Option(\n default=None,\n show_default=False,\n help='Package manager to be used in the new project. If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 493, + "kind": 32768, + "kindString": "Parameter", + "name": "package_manager", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "typer.Option(\n default=None,\n show_default=False,\n help='The URL where crawling should start. If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 494, + "kind": 32768, + "kindString": "Parameter", + "name": "start_url", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n '--apify/--no-apify',\n show_default=False,\n help='Should Apify integration be set up for you? 
If not given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 495, + "kind": 32768, + "kindString": "Parameter", + "name": "enable_apify_integration", + "type": { + "name": "Optional", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Patches `browserforge` to use data from `apify_fingerprint_datapoints`.\n\nThis avoids import time or runtime file downloads." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 496, + "module": "_browserforge_workaround", + "name": "patch_browserforge", + "parsedDocstring": { + "text": "Patches `browserforge` to use data from `apify_fingerprint_datapoints`.\n\nThis avoids import time or runtime file downloads." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_browserforge_workaround.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 4 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Patches `browserforge` to use data from `apify_fingerprint_datapoints`.\n\nThis avoids import time or runtime file downloads." 
+ } + ] + }, + "flags": {}, + "id": 497, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "patch_browserforge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 498, + "module": "__init__", + "name": "__version__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/__init__.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 499, + "module": "storages._request_queue", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 500, + "module": "storages._request_queue", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, 
+ "flags": {}, + "groups": [], + "id": 502, + "module": "storages._request_queue", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 503, + "module": "storages._request_queue", + "name": "was_already_handled", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 504, + "module": "storages._request_queue", + "name": "hydrated", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 505, + "module": "storages._request_queue", + "name": "lock_expires_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 506, + "module": "storages._request_queue", + "name": "forefront", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 506, + 504, + 502, + 505, + 503 + ], + "title": "Properties" + } + ], + "id": 501, + "module": "storages._request_queue", + "name": "CachedRequest", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 508, + "module": "storages._request_queue", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 509, + 
"kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 510, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 511, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 512, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of RequestQueue from a storage metadata object." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 513, + "module": "storages._request_queue", + "name": "from_storage_object", + "parsedDocstring": { + "text": "Initialize a new instance of RequestQueue from a storage metadata object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of RequestQueue from a storage metadata object." 
+ } + ] + }, + "flags": {}, + "id": 514, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 515, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 516, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "RequestQueue", + "type": "reference", + "target": "507" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 517, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 735, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 518, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 736, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the full storage object." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 519, + "module": "storages._base", + "name": "storage_object", + "parsedDocstring": { + "text": "Get the full storage object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + }, + "overwrites": { + "name": "Storage.storage_object", + "target": 738, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "storage_object" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 520, + "module": "storages._request_queue", + "name": "storage_object", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 521, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + 
"name": "storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 523, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name.", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 742, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 743, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 744, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 745, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 746, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "734" + }, + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 529, + "module": "request_loaders._request_manager", + "name": "drop", + "parsedDocstring": { + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." 
+ } + ] + }, + "flags": {}, + "id": 1754, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 532, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 191 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 1756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1757, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1758, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + }, + "overwrites": { + "name": "RequestManager.add_request", + "target": 1755, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_request", + "target": 1755, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 536, + "module": "request_loaders._request_manager", + "name": "add_requests_batched", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + 
"args": { + "requests": "Requests to enqueue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1760, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests_batched", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1761, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1762, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1763, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1764, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1765, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests_batched", + "target": 1759, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_requests_batched", + "target": 1759, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 543, + "module": "storages._request_queue", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "request_id": "ID of the request to retrieve.\n" + }, + "returns": "The retrieved 
request, or `None`, if it does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 292 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or `None`, if it does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 544, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 545, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. 
To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 546, + "module": "storages._request_queue", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 303 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 547, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3534, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3534, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 548, + "module": "storages._request_queue", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 360 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 549, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 550, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 3535, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 3535, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [], + "flags": {}, + 
"groups": [], + "id": 551, + "module": "storages._request_queue", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 386 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 552, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 553, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 554, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 1766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 1766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check whether the queue is empty.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 555, + "module": "storages._request_queue", + "name": "is_empty", + "parsedDocstring": { + "text": "Check whether the queue is empty.\n", + "returns": "bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 421 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check whether the queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 556, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3532, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3532, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check whether the queue is finished.\n\nDue to the nature of distributed storage used by the queue, the function might occasionally return a false\nnegative, but it will never return a false positive.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 557, + "module": "storages._request_queue", + "name": "is_finished", + "parsedDocstring": { + "text": "Check whether the queue is finished.\n\nDue to the nature of distributed storage used by the queue, the function might occasionally return a false\nnegative, but it will never return a false positive.\n", + "returns": "bool: `True` if all requests were already handled and there are no more left. `False` otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 430 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "bool: `True` if all requests were already handled and there are no more left. `False` otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check whether the queue is finished.\n\nDue to the nature of distributed storage used by the queue, the function might occasionally return a false\nnegative, but it will never return a false positive.\n" + } + ] + }, + "flags": {}, + "id": 558, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3533, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3533, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 559, + "module": "storages._request_queue", + "name": "get_info", + "parsedDocstring": { + "text": "Get an object containing general information about the request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 489 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the request queue." + } + ] + }, + "flags": {}, + "id": 560, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_info", + "parameters": [], + "type": { + "name": "RequestQueueMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "763" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 561, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of handled requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 494 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." 
+ } + ] + }, + "flags": {}, + "id": 1783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3536, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3536, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 563, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 498 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." 
+ } + ] + }, + "flags": {}, + "id": 1772, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3531, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3531, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3584, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 1785, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1786, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "1717" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a queue storage for managing HTTP requests in web crawling operations.\n\nThe `RequestQueue` class handles a queue of HTTP requests, each identified by a unique URL, to facilitate structured\nweb crawling. It supports both breadth-first and depth-first crawling strategies, allowing for recursive crawling\nstarting from an initial set of URLs. Each URL in the queue is uniquely identified by a `unique_key`, which can be\ncustomized to allow the same URL to be added multiple times under different keys.\n\nData can be stored either locally or in the cloud. It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{QUEUE_ID}`: The identifier for the request queue, either \"default\" or as specified.\n- `{REQUEST_ID}`: The unique identifier for each request in the queue.\n\nThe `RequestQueue` supports both creating new queues and opening existing ones by `id` or `name`. Named queues\npersist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. 
The queue supports\nmutable operations, allowing URLs to be added and removed as needed.\n\n### Usage\n\n```python\nfrom crawlee.storages import RequestQueue\n\nrq = await RequestQueue.open(name='my_rq')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 508, + 532, + 536, + 529, + 546, + 513, + 561, + 559, + 543, + 563, + 555, + 557, + 548, + 523, + 551, + 520, + 3584 + ], + "title": "Methods" + }, + { + "children": [ + 517, + 518, + 519 + ], + "title": "Properties" + } + ], + "id": 507, + "module": "storages._request_queue", + "name": "RequestQueue", + "parsedDocstring": { + "text": "Represents a queue storage for managing HTTP requests in web crawling operations.\n\nThe `RequestQueue` class handles a queue of HTTP requests, each identified by a unique URL, to facilitate structured\nweb crawling. It supports both breadth-first and depth-first crawling strategies, allowing for recursive crawling\nstarting from an initial set of URLs. Each URL in the queue is uniquely identified by a `unique_key`, which can be\ncustomized to allow the same URL to be added multiple times under different keys.\n\nData can be stored either locally or in the cloud. It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{QUEUE_ID}`: The identifier for the request queue, either \"default\" or as specified.\n- `{REQUEST_ID}`: The unique identifier for each request in the queue.\n\nThe `RequestQueue` supports both creating new queues and opening existing ones by `id` or `name`. 
Named queues\npersist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. The queue supports\nmutable operations, allowing URLs to be added and removed as needed.\n\n### Usage\n\n```python\nfrom crawlee.storages import RequestQueue\n\nrq = await RequestQueue.open(name='my_rq')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestManager", + "target": "1752", + "type": "reference" + }, + { + "name": "Storage", + "target": "734", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 565, + "module": "storages._key_value_store", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 566, + "module": "storages._key_value_store", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 568, + "module": "storages._key_value_store", + "name": "root", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 568 + ], + "title": "Properties" + } + ], + "id": 567, + "module": "storages._key_value_store", + "name": "AutosavedValue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 570, + "module": "storages._key_value_store", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 571, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 572, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + 
"type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 573, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 574, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of KeyValueStore from a storage metadata object." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 575, + "module": "storages._key_value_store", + "name": "from_storage_object", + "parsedDocstring": { + "text": "Initialize a new instance of KeyValueStore from a storage metadata object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of KeyValueStore from a storage metadata object." 
+ } + ] + }, + "flags": {}, + "id": 576, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 577, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 578, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 579, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 735, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 580, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 736, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the full storage object." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 581, + "module": "storages._base", + "name": "storage_object", + "parsedDocstring": { + "text": "Get the full storage object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + }, + "overwrites": { + "name": "Storage.storage_object", + "target": 738, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "storage_object" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 582, + "module": "storages._key_value_store", + "name": "storage_object", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 583, + "kind": 4096, + "kindString": "Call signature", + "modifiers": 
[], + "name": "storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 584, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the key value store." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 585, + "module": "storages._key_value_store", + "name": "get_info", + "parsedDocstring": { + "text": "Get an object containing general information about the key value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the key value store." 
+ } + ] + }, + "flags": {}, + "id": 586, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_info", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "760" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 587, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name.", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 742, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 743, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 744, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 745, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 746, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "734" + }, + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 593, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ } + ] + }, + "flags": {}, + "id": 748, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 595, + "module": "storages._key_value_store", + "name": "get_value", + "parsedDocstring": { + "text": "Get a value from the KVS.\n", + "args": { + "key": "Key of the record to retrieve.", + "default_value": "Default value returned in case the record does not exist.\n" + }, + "returns": "The value associated with the given key. `default_value` is used in case the record does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 596, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 597, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 598, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 617, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 618, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 619, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 620, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 621, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 622, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 623, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "117" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the existing keys in the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 599, + "module": "storages._key_value_store", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over the existing keys in the KVS.\n", + "args": { + "exclusive_start_key": "Key to start the iteration from.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 179 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the existing keys in the KVS.\n" + } + ] + }, + "flags": {}, + "id": 600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + 
"text": "Key to start the iteration from.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 601, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreKeyInfo", + "target": "782" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 602, + "module": "storages._key_value_store", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the KVS.\n", + "args": { + "key": "Key of the record to set.", + "value": "Value to set. If `None`, the record is deleted.", + "content_type": "Content type of the record." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the KVS.\n" + } + ] + }, + "flags": {}, + "id": 603, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to set." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 604, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value to set. If `None`, the record is deleted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 605, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content type of the record." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 606, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 607, + "module": "storages._key_value_store", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n", + "args": { + "key": "Key of the record for which URL is required.\n" + }, + "returns": "The public URL for the given key." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The public URL for the given key." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "flags": {}, + "id": 608, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record for which URL is required.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 609, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a value from KVS that will be automatically saved on changes.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 610, + "module": "storages._key_value_store", + "name": "get_auto_saved_value", + "parsedDocstring": { + "text": "Get a value from KVS that will be automatically saved on changes.\n", + "args": { + "key": "Key of the record, to store the value.", + "default_value": "Value to be used if the record does not exist yet. Should be a dictionary.\n" + }, + "returns": "Return the value of the key." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 226 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Return the value of the key." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from KVS that will be automatically saved on changes.\n" + } + ] + }, + "flags": {}, + "id": 611, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_auto_saved_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record, to store the value." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 612, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value to be used if the record does not exist yet. Should be a dictionary.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 613, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 614, + "module": "storages._key_value_store", + "name": "persist_autosaved_values", + "parsedDocstring": { + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 270 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." + } + ] + }, + "flags": {}, + "id": 615, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_autosaved_values", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a key-value based storage for reading and writing data records or files.\n\nEach data record is identified by a unique key and associated with a specific MIME content type. This class is\ncommonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other\ncontent types.\n\nData can be stored either locally or in the cloud. 
It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{STORE_ID}`: The identifier for the key-value store, either \"default\" or as specified by\n`CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`.\n- `{KEY}`: The unique key for the record.\n- `{EXT}`: The file extension corresponding to the MIME type of the content.\n\nTo open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`.\nIf none are specified, the default store for the current crawler run is used. Attempting to open a store by `id`\nthat does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not\nalready exist.\n\n### Usage\n\n```python\nfrom crawlee.storages import KeyValueStore\n\nkvs = await KeyValueStore.open(name='my_kvs')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 570, + 593, + 575, + 610, + 585, + 607, + 595, + 599, + 587, + 614, + 602, + 582 + ], + "title": "Methods" + }, + { + "children": [ + 579, + 580, + 581 + ], + "title": "Properties" + } + ], + "id": 569, + "module": "storages._key_value_store", + "name": "KeyValueStore", + "parsedDocstring": { + "text": "Represents a key-value based storage for reading and writing data records or files.\n\nEach data record is identified by a unique key and associated with a specific MIME content type. This class is\ncommonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other\ncontent types.\n\nData can be stored either locally or in the cloud. 
It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{STORE_ID}`: The identifier for the key-value store, either \"default\" or as specified by\n`CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`.\n- `{KEY}`: The unique key for the record.\n- `{EXT}`: The file extension corresponding to the MIME type of the content.\n\nTo open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`.\nIf none are specified, the default store for the current crawler run is used. Attempting to open a store by `id`\nthat does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not\nalready exist.\n\n### Usage\n\n```python\nfrom crawlee.storages import KeyValueStore\n\nkvs = await KeyValueStore.open(name='my_kvs')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Storage", + "target": "734", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 624, + "module": "storages._dataset", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 
128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skip the specified number of items at the start." + } + ] + }, + "flags": {}, + "groups": [], + "id": 626, + "module": "storages._dataset", + "name": "offset", + "parsedDocstring": { + "text": "Skip the specified number of items at the start." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": {}, + "groups": [], + "id": 627, + "module": "storages._dataset", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. Unlimited if None." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "flags": {}, + "groups": [], + "id": 628, + "module": "storages._dataset", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 629, + "module": "storages._dataset", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 630, + "module": "storages._dataset", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 631, + "module": "storages._dataset", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwind items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": {}, + "groups": [], + "id": 632, + "module": "storages._dataset", + "name": "unwind", + "parsedDocstring": { + "text": "Unwind items by a specified array field, turning each element into a separate item." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude empty items from the results if True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 633, + "module": "storages._dataset", + "name": "skip_empty", + "parsedDocstring": { + "text": "Exclude empty items from the results if True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude fields starting with '#' if True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 634, + "module": "storages._dataset", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Exclude fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Field to be flattened in returned items." + } + ] + }, + "flags": {}, + "groups": [], + "id": 635, + "module": "storages._dataset", + "name": "flatten", + "parsedDocstring": { + "text": "Field to be flattened in returned items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specify the dataset view to be used." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 636, + "module": "storages._dataset", + "name": "view", + "parsedDocstring": { + "text": "Specify the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `get_data` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 628, + 629, + 630, + 635, + 627, + 626, + 631, + 633, + 634, + 632, + 636 + ], + "title": "Properties" + } + ], + "id": 625, + "module": "storages._dataset", + "name": "GetDataKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `get_data` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data." + } + ] + }, + "flags": {}, + "groups": [], + "id": 638, + "module": "storages._dataset", + "name": "key", + "parsedDocstring": { + "text": "The key under which to save the data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data. Either 'json' or 'csv'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 639, + "module": "storages._dataset", + "name": "content_type", + "parsedDocstring": { + "text": "The format in which to export the data. Either 'json' or 'csv'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "json" + }, + { + "type": "literal", + "value": "csv" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file." + } + ] + }, + "flags": {}, + "groups": [], + "id": 640, + "module": "storages._dataset", + "name": "to_key_value_store_id", + "parsedDocstring": { + "text": "ID of the key-value store to save the exported file." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file." + } + ] + }, + "flags": {}, + "groups": [], + "id": 641, + "module": "storages._dataset", + "name": "to_key_value_store_name", + "parsedDocstring": { + "text": "Name of the key-value store to save the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_to` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 639, + 638, + 640, + 641 + ], + "title": "Properties" + } + ], + "id": 637, + "module": "storages._dataset", + "name": "ExportToKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_to` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 643, + "module": "storages._dataset", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": {}, + "groups": [], + "id": 644, + "module": "storages._dataset", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 645, + "module": "storages._dataset", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 646, + "module": "storages._dataset", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." + } + ] + }, + "flags": {}, + "groups": [], + "id": 647, + "module": "storages._dataset", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": {}, + "groups": [], + "id": 648, + "module": "storages._dataset", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). 
The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": {}, + "groups": [], + "id": 649, + "module": "storages._dataset", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 650, + "module": "storages._dataset", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 651, + "module": "storages._dataset", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_data_json` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 646, + 645, + 647, + 650, + 644, + 648, + 649, + 643, + 651 + ], + "title": "Properties" + } + ], + "id": 642, + "module": "storages._dataset", + "name": "ExportDataJsonKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_data_json` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 653, + "module": "storages._dataset", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." + } + ] + }, + "flags": {}, + "groups": [], + "id": 654, + "module": "storages._dataset", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 655, + "module": "storages._dataset", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + } + ] + }, + "flags": {}, + "groups": [], + "id": 656, + "module": "storages._dataset", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 657, + "module": "storages._dataset", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 658, + "module": "storages._dataset", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 659, + "module": "storages._dataset", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + } + ] + }, + "flags": {}, + "groups": [], + "id": 660, + "module": "storages._dataset", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": {}, + "groups": [], + "id": 661, + "module": "storages._dataset", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_data_csv` method." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 654, + 653, + 655, + 656, + 657, + 658, + 659, + 660, + 661 + ], + "title": "Properties" + } + ], + "id": 652, + "module": "storages._dataset", + "name": "ExportDataCsvKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_data_csv` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 663, + "module": "storages._dataset", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 664, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 665, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 666, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 667, + "kind": 32768, + "kindString": "Parameter", + "name": 
"storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of Dataset from a storage metadata object." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 668, + "module": "storages._dataset", + "name": "from_storage_object", + "parsedDocstring": { + "text": "Initialize a new instance of Dataset from a storage metadata object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance of Dataset from a storage metadata object." + } + ] + }, + "flags": {}, + "id": 669, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 670, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 671, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 672, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 735, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 673, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 736, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the full storage object." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 674, + "module": "storages._base", + "name": "storage_object", + "parsedDocstring": { + "text": "Get the full storage object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 234 + } + ], + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + }, + "overwrites": { + "name": "Storage.storage_object", + "target": 738, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "storage_object" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 675, + "module": "storages._dataset", + "name": "storage_object", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 676, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 677, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 678, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": 
"The storage ID.", + "name": "The storage name.", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 742, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 743, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 744, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 745, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 746, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "734" + }, + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 684, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 266 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "flags": {}, + "id": 748, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 686, + "module": "storages._dataset", + "name": "push_data", + "parsedDocstring": { + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n", + "args": { + "data": "A JSON serializable data structure to be stored in the dataset. 
The JSON representation\nof each item must be smaller than 9MB.", + "kwargs": "Keyword arguments for the storage client method." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 272 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n" + } + ] + }, + "flags": {}, + "id": 687, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A JSON serializable data structure to be stored in the dataset. The JSON representation\nof each item must be smaller than 9MB." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 688, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items based on filtering, sorting, and pagination parameters.\n\nThis method allows customization of the data retrieval process from a dataset, supporting operations such as\nfield selection, ordering, and skipping specific records based on provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 690, + "module": "storages._dataset", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve dataset items based on filtering, sorting, and pagination parameters.\n\nThis method allows customization of the data retrieval process from a dataset, supporting operations such as\nfield selection, ordering, and skipping specific records based on provided parameters.\n", + "args": { + "kwargs": "Keyword arguments for the storage client method.\n" + }, + "returns": "List page containing filtered and paginated dataset items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 298 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "List page containing filtered and paginated dataset items." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items based on filtering, sorting, and pagination parameters.\n\nThis method allows customization of the data retrieval process from a dataset, supporting operations such as\nfield selection, ordering, and skipping specific records based on provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 691, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skip the specified number of items at the start." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 626, + "module": "storages._dataset", + "name": "offset", + "parsedDocstring": { + "text": "Skip the specified number of items at the start." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 627, + "module": "storages._dataset", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. Unlimited if None." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 628, + "module": "storages._dataset", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 629, + "module": "storages._dataset", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 630, + "module": "storages._dataset", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 631, + "module": "storages._dataset", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwind items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 632, + "module": "storages._dataset", + "name": "unwind", + "parsedDocstring": { + "text": "Unwind items by a specified array field, turning each element into a separate item." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude empty items from the results if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 633, + "module": "storages._dataset", + "name": "skip_empty", + "parsedDocstring": { + "text": "Exclude empty items from the results if True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude fields starting with '#' if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 634, + "module": "storages._dataset", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Exclude fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Field to be flattened in returned items." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 635, + "module": "storages._dataset", + "name": "flatten", + "parsedDocstring": { + "text": "Field to be flattened in returned items." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specify the dataset view to be used." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 636, + "module": "storages._dataset", + "name": "view", + "parsedDocstring": { + "text": "Specify the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into an arbitrary stream.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 693, + "module": "storages._dataset", + "name": "write_to_csv", + "parsedDocstring": { + "text": "Export the entire dataset into an arbitrary stream.\n", + "args": { + "destination": "The stream into which the dataset contents should be written.", + "kwargs": "Additional keyword arguments for `csv.writer`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 312 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into an arbitrary stream.\n" + } + ] + }, + "flags": {}, + "id": 694, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "write_to_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The stream into which the dataset contents should be written." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 695, + "kind": 32768, + "kindString": "Parameter", + "name": "destination", + "type": { + "name": "TextIO", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 653, + "module": "storages._dataset", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 654, + "module": "storages._dataset", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 655, + "module": "storages._dataset", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 656, + "module": "storages._dataset", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 657, + "module": "storages._dataset", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 658, + "module": "storages._dataset", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 659, + "module": "storages._dataset", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 660, + "module": "storages._dataset", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 661, + "module": "storages._dataset", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into an arbitrary stream.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 697, + "module": "storages._dataset", + "name": "write_to_json", + "parsedDocstring": { + "text": "Export the entire dataset into an arbitrary stream.\n", + "args": { + "destination": "The stream into which the dataset contents should be written.", + "kwargs": "Additional keyword arguments for `json.dump`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 336 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into an arbitrary stream.\n" + } + ] + }, + "flags": {}, + "id": 698, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "write_to_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The stream into which the dataset contents should be written." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 699, + "kind": 32768, + "kindString": "Parameter", + "name": "destination", + "type": { + "name": "TextIO", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 643, + "module": "storages._dataset", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 644, + "module": "storages._dataset", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 645, + "module": "storages._dataset", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 646, + "module": "storages._dataset", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. 
If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 647, + "module": "storages._dataset", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 648, + "module": "storages._dataset", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 649, + "module": "storages._dataset", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 650, + "module": "storages._dataset", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 651, + "module": "storages._dataset", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. 
The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 701, + "module": "storages._dataset", + "name": "export_to", + "parsedDocstring": { + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n", + "args": { + "kwargs": "Keyword arguments for the storage client method." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 359 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. 
The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "flags": {}, + "id": 702, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_to", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 638, + "module": "storages._dataset", + "name": "key", + "parsedDocstring": { + "text": "The key under which to save the data." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data. Either 'json' or 'csv'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 639, + "module": "storages._dataset", + "name": "content_type", + "parsedDocstring": { + "text": "The format in which to export the data. Either 'json' or 'csv'." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "json" + }, + { + "type": "literal", + "value": "csv" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 640, + "module": "storages._dataset", + "name": "to_key_value_store_id", + "parsedDocstring": { + "text": "ID of the key-value store to save the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 641, + "module": "storages._dataset", + "name": "to_key_value_store_name", + "parsedDocstring": { + "text": "Name of the key-value store to save the exported file." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the dataset." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 704, + "module": "storages._dataset", + "name": "get_info", + "parsedDocstring": { + "text": "Get an object containing general information about the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 391 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an object containing general information about the dataset." + } + ] + }, + "flags": {}, + "id": 705, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_info", + "parameters": [], + "type": { + "name": "DatasetMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "757" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over dataset items, applying filtering, sorting, and pagination.\n\nRetrieve dataset items incrementally, allowing fine-grained control over the data fetched. 
The function\nsupports various parameters to filter, sort, and limit the data returned, facilitating tailored dataset\nqueries.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 706, + "module": "storages._dataset", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over dataset items, applying filtering, sorting, and pagination.\n\nRetrieve dataset items incrementally, allowing fine-grained control over the data fetched. The function\nsupports various parameters to filter, sort, and limit the data returned, facilitating tailored dataset\nqueries.\n", + "args": { + "offset": "Initial number of items to skip.", + "limit": "Max number of items to return. No limit if None.", + "clean": "Filter out empty items and hidden fields if True.", + "desc": "Return items in reverse order if True.", + "fields": "Specific fields to include in each item.", + "omit": "Fields to omit from each item.", + "unwind": "Field name to unwind items by.", + "skip_empty": "Omits empty items if True.", + "skip_hidden": "Excludes fields starting with '#' if True.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 398 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over dataset items, applying filtering, sorting, and pagination.\n\nRetrieve dataset items incrementally, allowing fine-grained control over the data fetched. The function\nsupports various parameters to filter, sort, and limit the data returned, facilitating tailored dataset\nqueries.\n" + } + ] + }, + "flags": {}, + "id": 707, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initial number of items to skip." 
+ } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 708, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Max number of items to return. No limit if None." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 709, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Filter out empty items and hidden fields if True." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 710, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return items in reverse order if True." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 711, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific fields to include in each item." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 712, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to omit from each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 713, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Field name to unwind items by." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 714, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Omits empty items if True." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 715, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 716, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Serialize a given item to JSON, checks its serializability and size against a limit.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 717, + "module": "storages._dataset", + "name": "check_and_serialize", + "parsedDocstring": { + "text": "Serialize a given item to JSON, checks its serializability and size against a limit.\n", + "args": { + "item": "The item to serialize.", + "index": "Index of the item, used for error context.\n" + }, + "returns": "Serialized JSON string." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Serialized JSON string." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Serialize a given item to JSON, checks its serializability and size against a limit.\n" + } + ] + }, + "flags": {}, + "id": 718, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "check_and_serialize", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The item to serialize." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 719, + "kind": 32768, + "kindString": "Parameter", + "name": "item", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Index of the item, used for error context.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 720, + "kind": 32768, + "kindString": "Parameter", + "name": "index", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents an append-only structured storage, ideal for tabular data similar to database tables.\n\nThe `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes\n(columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not\nmodified or deleted. This makes it particularly useful for storing results from web crawling operations.\n\nData can be stored either locally or in the cloud. 
It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{DATASET_ID}`: Specifies the dataset, either \"default\" or a custom dataset ID.\n- `{INDEX}`: Represents the zero-based index of the record within the dataset.\n\nTo open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are\nprovided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does\nnot exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already\nexist.\n\n### Usage\n\n```python\nfrom crawlee.storages import Dataset\n\ndataset = await Dataset.open(name='my_dataset')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 663, + 717, + 684, + 701, + 668, + 690, + 704, + 706, + 678, + 686, + 675, + 693, + 697 + ], + "title": "Methods" + }, + { + "children": [ + 672, + 673, + 674 + ], + "title": "Properties" + } + ], + "id": 662, + "module": "storages._dataset", + "name": "Dataset", + "parsedDocstring": { + "text": "Represents an append-only structured storage, ideal for tabular data similar to database tables.\n\nThe `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes\n(columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not\nmodified or deleted. This makes it particularly useful for storing results from web crawling operations.\n\nData can be stored either locally or in the cloud. 
It depends on the setup of underlying storage client.\nBy default a `MemoryStorageClient` is used, but it can be changed to a different one.\n\nBy default, data is stored using the following path structure:\n```\n{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json\n```\n\n- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.\n- `{DATASET_ID}`: Specifies the dataset, either \"default\" or a custom dataset ID.\n- `{INDEX}`: Represents the zero-based index of the record within the dataset.\n\nTo open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are\nprovided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does\nnot exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already\nexist.\n\n### Usage\n\n```python\nfrom crawlee.storages import Dataset\n\ndataset = await Dataset.open(name='my_dataset')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Storage", + "target": "734", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 721, + "module": "storages._creation_management", + "name": "TResource", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + 
"kind": "text", + "text": "Open either a new storage or restore an existing one and return it." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 722, + "module": "storages._creation_management", + "name": "open_storage", + "parsedDocstring": { + "text": "Open either a new storage or restore an existing one and return it." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open either a new storage or restore an existing one and return it." + } + ] + }, + "flags": {}, + "id": 723, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open_storage", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 724, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_class", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TResource", + "target": "721" + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 725, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 726, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 727, + "kind": 32768, + "kindString": "Parameter", + 
"name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "93" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 728, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "1189" + } + } + ], + "type": { + "name": "TResource", + "type": "reference", + "target": "721" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage from cache by ID or name." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 729, + "module": "storages._creation_management", + "name": "remove_storage_from_cache", + "parsedDocstring": { + "text": "Remove a storage from cache by ID or name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 187 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage from cache by ID or name." 
+ } + ] + }, + "flags": {}, + "id": 730, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "remove_storage_from_cache", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 731, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_class", + "type": { + "name": "type", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 732, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 733, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 735, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 736, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the full storage object." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 737, + "module": "storages._base", + "name": "storage_object", + "parsedDocstring": { + "text": "Get the full storage object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the full storage object." + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "storage_object" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 738, + "module": "storages._base", + "name": "storage_object", + "parsedDocstring": { + "text": "Set the full storage object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the full storage object." + } + ] + }, + "flags": {}, + "id": 739, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "storage_object", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 740, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_object", + "type": { + "name": "StorageMetadata", + "type": "reference", + "target": "750" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 741, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name.", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 742, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 743, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 744, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 745, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 746, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "734" + }, + "overwrites": { + "name": "Storage.open", + "target": 741, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 747, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "flags": {}, + "id": 748, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class for storages." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 747, + 741, + 738 + ], + "title": "Methods" + }, + { + "children": [ + 735, + 736, + 737 + ], + "title": "Properties" + } + ], + "id": 734, + "module": "storages._base", + "name": "Storage", + "parsedDocstring": { + "text": "Base class for storages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RequestQueue", + "target": "507", + "type": "reference" + }, + { + "name": "KeyValueStore", + "target": "569", + "type": "reference" + }, + { + "name": "Dataset", + "target": "662", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 749, + "module": "storage_clients.models", + "name": "KvsValueType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 751, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 752, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 753, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default='')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 754, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 755, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + 
}, + "flags": {}, + "groups": [], + "id": 756, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents the base model for storage metadata.\n\nIt contains common fields shared across all specific storage types." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 754, + 755, + 752, + 751, + 756, + 753 + ], + "title": "Properties" + } + ], + "id": 750, + "module": "storage_clients.models", + "name": "StorageMetadata", + "parsedDocstring": { + "text": "Represents the base model for storage metadata.\n\nIt contains common fields shared across all specific storage types." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DatasetMetadata", + "target": "757", + "type": "reference" + }, + { + "name": "KeyValueStoreMetadata", + "target": "760", + "type": "reference" + }, + { + "name": "RequestQueueMetadata", + "target": "763", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 758, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 751, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 759, + "module": "storage_clients.models", + "name": "item_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3569, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 752, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3570, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default='')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 753, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3571, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 754, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3572, + "module": "storage_clients.models", + "name": 
"created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 755, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3573, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.modified_at", + "target": 756, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a dataset metadata." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3571, + 3572, + 3569, + 759, + 758, + 3573, + 3570 + ], + "title": "Properties" + } + ], + "id": 757, + "module": "storage_clients.models", + "name": "DatasetMetadata", + "parsedDocstring": { + "text": "Model for a dataset metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "750", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 761, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 751, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 762, + "module": "storage_clients.models", + "name": "user_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3574, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + 
"line": 28 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 752, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3575, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default='')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 753, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3576, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 754, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3577, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 755, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3578, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.modified_at", + "target": 756, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store metadata." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3576, + 3577, + 3574, + 761, + 3578, + 3575, + 762 + ], + "title": "Properties" + } + ], + "id": 760, + "module": "storage_clients.models", + "name": "KeyValueStoreMetadata", + "parsedDocstring": { + "text": "Model for a key-value store metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "750", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 764, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 751, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 765, + "module": "storage_clients.models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 766, + "module": "storage_clients.models", + "name": "handled_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 767, + "module": "storage_clients.models", + "name": "pending_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 768, + "module": "storage_clients.models", + "name": "stats", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 769, + "module": "storage_clients.models", + "name": "total_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 770, + "module": "storage_clients.models", + "name": "user_id", + "parsedDocstring": { + "text": "" + }, 
+ "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 771, + "module": "storage_clients.models", + "name": "resource_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3579, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 752, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3580, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default='')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + 
"type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 753, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3581, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 754, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3582, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 755, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3583, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": 
"StorageMetadata.modified_at", + "target": 756, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a request queue metadata." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3581, + 3582, + 765, + 766, + 3579, + 764, + 3583, + 3580, + 767, + 771, + 768, + 769, + 770 + ], + "title": "Properties" + } + ], + "id": 763, + "module": "storage_clients.models", + "name": "RequestQueueMetadata", + "parsedDocstring": { + "text": "Model for a request queue metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "750", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 773, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 774, + "module": "storage_clients.models", + "name": "key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + 
"line": 74 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 775, + "module": "storage_clients.models", + "name": "value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "KvsValueType", + "type": "reference", + "target": "749" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 776, + "module": "storage_clients.models", + "name": "content_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='contentType', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 777, + "module": "storage_clients.models", + "name": "filename", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='filename', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + 
} + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store record." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 776, + 777, + 774, + 773, + 775 + ], + "title": "Properties" + } + ], + "id": 772, + "module": "storage_clients.models", + "name": "KeyValueStoreRecord", + "parsedDocstring": { + "text": "Model for a key-value store record." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 779, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 780, + "module": "storage_clients.models", + "name": "key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 781, + "module": "storage_clients.models", + "name": "content_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store record metadata." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 781, + 780, + 779 + ], + "title": "Properties" + } + ], + "id": 778, + "module": "storage_clients.models", + "name": "KeyValueStoreRecordMetadata", + "parsedDocstring": { + "text": "Model for a key-value store record metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 783, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 784, + "module": "storage_clients.models", + "name": "key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 785, + "module": "storage_clients.models", + "name": "size", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store key info." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 784, + 783, + 785 + ], + "title": "Properties" + } + ], + "id": 782, + "module": "storage_clients.models", + "name": "KeyValueStoreKeyInfo", + "parsedDocstring": { + "text": "Model for a key-value store key info." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 787, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 788, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 789, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + 
"text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 790, + "module": "storage_clients.models", + "name": "is_truncated", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 791, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreKeyInfo", + "target": "782" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 792, + "module": "storage_clients.models", + "name": "exclusive_start_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='exclusiveStartKey', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 793, + "module": "storage_clients.models", + "name": 
"next_exclusive_start_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for listing keys in the key-value store." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 788, + 792, + 790, + 791, + 789, + 787, + 793 + ], + "title": "Properties" + } + ], + "id": 786, + "module": "storage_clients.models", + "name": "KeyValueStoreListKeysPage", + "parsedDocstring": { + "text": "Model for listing keys in the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 795, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 796, + "module": "storage_clients.models", + "name": "was_limit_reached", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 120 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 797, + "module": "storage_clients.models", + "name": "prev_limit", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 798, + "module": "storage_clients.models", + "name": "queue_modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 799, + "module": "storage_clients.models", + "name": "query_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 800, + "module": "storage_clients.models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for the request queue head state." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 800, + 795, + 797, + 799, + 798, + 796 + ], + "title": "Properties" + } + ], + "id": 794, + "module": "storage_clients.models", + "name": "RequestQueueHeadState", + "parsedDocstring": { + "text": "Model for the request queue head state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 802, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 803, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='limit', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 804, + "module": "storage_clients.models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": 
{ + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 805, + "module": "storage_clients.models", + "name": "queue_modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 806, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ], + "target": "866" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for the request queue head." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 804, + 806, + 803, + 802, + 805 + ], + "title": "Properties" + } + ], + "id": 801, + "module": "storage_clients.models", + "name": "RequestQueueHead", + "parsedDocstring": { + "text": "Model for the request queue head." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RequestQueueHeadWithLocks", + "target": "807", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 808, + "module": "storage_clients.models", + "name": "lock_secs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 809, + "module": "storage_clients.models", + "name": "queue_has_locked_requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "Annotated[bool | None, Field(alias='queueHasLockedRequests')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3564, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "RequestQueueHead.model_config", + "target": 802, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3565, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='limit', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "RequestQueueHead.limit", + "target": 803, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3566, + "module": "storage_clients.models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "RequestQueueHead.had_multiple_clients", + "target": 804, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3567, + "module": "storage_clients.models", + "name": "queue_modified_at", + 
"parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "RequestQueueHead.queue_modified_at", + "target": 805, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3568, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ], + "target": "866" + }, + "inheritedFrom": { + "name": "RequestQueueHead.items", + "target": 806, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for request queue head with locks." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3566, + 3568, + 3565, + 808, + 3564, + 809, + 3567 + ], + "title": "Properties" + } + ], + "id": 807, + "module": "storage_clients.models", + "name": "RequestQueueHeadWithLocks", + "parsedDocstring": { + "text": "Model for request queue head with locks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestQueueHead", + "target": "801", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 811, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 812, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 813, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 814, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 815, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 816, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of storage items returned from a collection list method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 812, + 816, + 814, + 811, + 813, + 815 + ], + "title": "Properties" + } + ], + "id": 810, + "module": "storage_clients.models", + "name": "_ListPage", + "parsedDocstring": { + "text": "Model for a single page of storage items returned from a collection list method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DatasetListPage", + "target": "817", + "type": "reference" + }, + { + "name": "KeyValueStoreListPage", + "target": "819", + "type": "reference" + }, + { + "name": "RequestQueueListPage", + "target": "821", + "type": "reference" + }, + { + "name": "DatasetItemsListPage", + "target": "823", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The list of dataset items returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 818, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "The list of dataset items returned on this page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "757" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3540, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.model_config", + "target": 811, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3541, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.count", + "target": 812, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3542, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.offset", + "target": 813, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3543, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.limit", + "target": 814, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3544, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.total", + "target": 815, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3545, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Annotated[bool, Field(default=False)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.desc", + "target": 816, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of dataset items returned from a collection list method." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3541, + 3545, + 818, + 3543, + 3540, + 3542, + 3544 + ], + "title": "Properties" + } + ], + "id": 817, + "module": "storage_clients.models", + "name": "DatasetListPage", + "parsedDocstring": { + "text": "Model for a single page of dataset items returned from a collection list method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_ListPage", + "target": "810", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The list of key-value store items returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 820, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "The list of key-value store items returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "760" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3546, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.model_config", + "target": 811, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3547, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.count", + "target": 812, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3548, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.offset", + "target": 813, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3549, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.limit", + "target": 814, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3550, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.total", + "target": 815, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3551, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Annotated[bool, Field(default=False)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.desc", + "target": 816, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of key-value store items returned from a collection list method." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3547, + 3551, + 820, + 3549, + 3546, + 3548, + 3550 + ], + "title": "Properties" + } + ], + "id": 819, + "module": "storage_clients.models", + "name": "KeyValueStoreListPage", + "parsedDocstring": { + "text": "Model for a single page of key-value store items returned from a collection list method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_ListPage", + "target": "810", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The list of request queue items returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 822, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "The list of request queue items returned on this page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "763" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3552, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.model_config", + "target": 811, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3553, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.count", + "target": 812, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3554, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.offset", + "target": 813, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3555, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.limit", + "target": 814, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3556, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.total", + "target": 815, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3557, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Annotated[bool, Field(default=False)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.desc", + "target": 816, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of request queue items returned from a collection list method." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3553, + 3557, + 822, + 3555, + 3552, + 3554, + 3556 + ], + "title": "Properties" + } + ], + "id": 821, + "module": "storage_clients.models", + "name": "RequestQueueListPage", + "parsedDocstring": { + "text": "Model for a single page of request queue items returned from a collection list method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_ListPage", + "target": "810", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The list of dataset items returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 824, + "module": "storage_clients.models", + "name": "items", + "parsedDocstring": { + "text": "The list of dataset items returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3558, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.model_config", + "target": 811, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3559, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.count", + "target": 812, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3560, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.offset", + "target": 813, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3561, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.limit", + "target": 814, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3562, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Annotated[int, Field(default=0)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.total", + "target": 815, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3563, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Annotated[bool, Field(default=False)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_ListPage.desc", + "target": 816, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of dataset items returned from a collection list method." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3559, + 3563, + 824, + 3561, + 3558, + 3560, + 3562 + ], + "title": "Properties" + } + ], + "id": 823, + "module": "storage_clients.models", + "name": "DatasetItemsListPage", + "parsedDocstring": { + "text": "Model for a single page of dataset items returned from a collection list method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_ListPage", + "target": "810", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 826, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + 
"kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 827, + "module": "storage_clients.models", + "name": "lock_expires_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 206 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Response to prolong request lock calls." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 827, + 826 + ], + "title": "Properties" + } + ], + "id": 825, + "module": "storage_clients.models", + "name": "ProlongRequestLockResponse", + "parsedDocstring": { + "text": "Response to prolong request lock calls." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 829, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 830, + "module": "storage_clients.models", + "name": "id", + 
"parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 831, + "module": "storage_clients.models", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 832, + "module": "storage_clients.models", + "name": "was_already_present", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 217 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 833, + "module": "storage_clients.models", + "name": "was_already_handled", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 218 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a processed request." 
+ } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 830, + 829, + 831, + 833, + 832 + ], + "title": "Properties" + } + ], + "id": 828, + "module": "storage_clients.models", + "name": "ProcessedRequest", + "parsedDocstring": { + "text": "Represents a processed request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 835, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 225 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 836, + "module": "storage_clients.models", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 837, + "module": "storage_clients.models", + "name": "url", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 838, + "module": "storage_clients.models", + "name": "method", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "Annotated[HttpMethod | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents an unprocessed request." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 838, + 835, + 836, + 837 + ], + "title": "Properties" + } + ], + "id": 834, + "module": "storage_clients.models", + "name": "UnprocessedRequest", + "parsedDocstring": { + "text": "Represents an unprocessed request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 840, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 841, + "module": "storage_clients.models", + "name": "processed_requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 238 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 842, + "module": "storage_clients.models", + "name": "unprocessed_requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "list", + 
"type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "UnprocessedRequest", + "target": "834" + } + ], + "target": "866" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Response to batch request deletion calls." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 840, + 841, + 842 + ], + "title": "Properties" + } + ], + "id": 839, + "module": "storage_clients.models", + "name": "BatchRequestsOperationResponse", + "parsedDocstring": { + "text": "Response to batch request deletion calls." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 844, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 245 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 845, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + 
{ + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 846, + "module": "storage_clients.models", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Order number for maintaining request sequence in queue.\nUsed for restoring correct request order when recovering queue from storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 847, + "module": "storage_clients.models", + "name": "order_no", + "parsedDocstring": { + "text": "Order number for maintaining request sequence in queue.\nUsed for restoring correct request order when recovering queue from storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Decimal | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Decimal" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 848, + "module": "storage_clients.models", + "name": "handled_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 255 + } + ], + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Original Request object. The alias 'json_' is required for backward compatibility with legacy code." + } + ] + }, + "flags": {}, + "groups": [], + "id": 849, + "module": "storage_clients.models", + "name": "request", + "parsedDocstring": { + "text": "Original Request object. The alias 'json_' is required for backward compatibility with legacy code." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 257 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create an internal request from a `Request` object." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 850, + "module": "storage_clients.models", + "name": "from_request", + "parsedDocstring": { + "text": "Create an internal request from a `Request` object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 265 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create an internal request from a `Request` object." + } + ] + }, + "flags": {}, + "id": 851, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 852, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 853, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 854, + "kind": 32768, + "kindString": "Parameter", + "name": "order_no", + "type": { + "name": "Decimal | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Decimal" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "InternalRequest", + "type": "reference", + "target": "843" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the internal request back to a `Request` object." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 855, + "module": "storage_clients.models", + "name": "to_request", + "parsedDocstring": { + "text": "Convert the internal request back to a `Request` object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 275 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the internal request back to a `Request` object." + } + ] + }, + "flags": {}, + "id": 856, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_request", + "parameters": [], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal representation of a queue request with additional metadata for ordering and storage." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 850, + 855 + ], + "title": "Methods" + }, + { + "children": [ + 848, + 845, + 844, + 847, + 849, + 846 + ], + "title": "Properties" + } + ], + "id": 843, + "module": "storage_clients.models", + "name": "InternalRequest", + "parsedDocstring": { + "text": "Internal representation of a queue request with additional metadata for ordering and storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 858, + "module": "storage_clients._memory._request_queue_collection_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 859, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 860, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 861, + "module": "storage_clients._memory._request_queue_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + 
], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 862, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 863, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 864, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 865, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "763" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 866, + "module": "storage_clients._memory._request_queue_collection_client", + "name": "list", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 867, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 868, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 869, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 870, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 871, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "RequestQueueListPage", + "type": "reference", + "target": "821" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient for manipulating request queues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 858, + 861, + 866 + ], + "title": "Methods" + } + ], + "id": 857, + "module": "storage_clients._memory._request_queue_collection_client", + "name": "RequestQueueCollectionClient", + "parsedDocstring": { + "text": "Subclient for manipulating request queues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 872, + "module": "storage_clients._memory._request_queue_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 874, + "module": "storage_clients._memory._request_queue_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 875, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 876, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true 
+ }, + "id": 877, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 878, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 879, + "kind": 32768, + "kindString": "Parameter", + "name": "created_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 880, + "kind": 32768, + "kindString": "Parameter", + "name": "accessed_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 881, + "kind": 32768, + "kindString": "Parameter", + "name": "modified_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 882, + "kind": 32768, + "kindString": "Parameter", + "name": "handled_request_count", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "0", + "flags": { + "isOptional": true, + 
"keyword-only": true + }, + "id": 883, + "kind": 32768, + "kindString": "Parameter", + "name": "pending_request_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource info for the request queue client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 884, + "module": "storage_clients._memory._request_queue_client", + "name": "resource_info", + "parsedDocstring": { + "text": "Get the resource info for the request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "763" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource directory for the client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 885, + "module": "storage_clients._memory._request_queue_client", + "name": "resource_directory", + "parsedDocstring": { + "text": "Get the resource directory for the client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 886, + "module": "storage_clients._memory._request_queue_client", + "name": "get", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 887, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "RequestQueueMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "763" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 888, + "module": "storage_clients._memory._request_queue_client", + "name": "update", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 889, + "kind": 4096, + "kindString": "Call signature", 
+ "modifiers": [ + "async" + ], + "name": "update", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 890, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "763" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 891, + "module": "storage_clients._memory._request_queue_client", + "name": "delete", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 892, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 893, + "module": "storage_clients._memory._request_queue_client", + "name": "list_head", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 894, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_head", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 895, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 896, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_in_progress", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "RequestQueueHead", + "type": "reference", + "target": "801" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 897, + "module": "storage_clients._memory._request_queue_client", + "name": "list_and_lock_head", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 898, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_and_lock_head", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 899, + "kind": 32768, + "kindString": "Parameter", + "name": "lock_secs", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": 
true + }, + "id": 900, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueHeadWithLocks", + "type": "reference", + "target": "807" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 901, + "module": "storage_clients._memory._request_queue_client", + "name": "add_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 902, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 903, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 904, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 905, + "module": 
"storage_clients._memory._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 296 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 906, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 907, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 908, + "module": "storage_clients._memory._request_queue_client", + "name": "update_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 314 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 909, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 910, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": 
"398" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 911, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 912, + "module": "storage_clients._memory._request_queue_client", + "name": "delete_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 374 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 913, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 914, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 915, + "module": "storage_clients._memory._request_queue_client", + "name": "prolong_request_lock", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 401 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 916, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "prolong_request_lock", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 917, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 918, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 919, + "kind": 32768, + "kindString": "Parameter", + "name": "lock_secs", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "ProlongRequestLockResponse", + "type": "reference", + "target": "825" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 920, + "module": "storage_clients._memory._request_queue_client", + "name": "delete_request_lock", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 411 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_request_lock", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 922, + "kind": 32768, 
+ "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 923, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 924, + "module": "storage_clients._memory._request_queue_client", + "name": "batch_add_requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 430 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "batch_add_requests", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 926, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 927, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "BatchRequestsOperationResponse", + "type": "reference", + "target": "839" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 928, + "module": "storage_clients._memory._request_queue_client", + "name": "batch_delete_requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 466 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 929, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "batch_delete_requests", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 930, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "BatchRequestsOperationResponse", + "type": "reference", + "target": "839" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 931, + "module": "storage_clients._memory._request_queue_client", + "name": "update_timestamps", + "parsedDocstring": { + "text": "Update the timestamps of the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 469 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the request queue." 
+ } + ] + }, + "flags": {}, + "id": 932, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update_timestamps", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 933, + "kind": 32768, + "kindString": "Parameter", + "name": "has_been_modified", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient for manipulating a single request queue." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 874, + 901, + 924, + 928, + 891, + 912, + 920, + 886, + 905, + 897, + 893, + 915, + 888, + 908, + 931 + ], + "title": "Methods" + }, + { + "children": [ + 885, + 884 + ], + "title": "Properties" + } + ], + "id": 873, + "module": "storage_clients._memory._request_queue_client", + "name": "RequestQueueClient", + "parsedDocstring": { + "text": "Subclient for manipulating a single request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 934, + "module": "storage_clients._memory._memory_storage_client", + "name": "TResourceClient", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 935, + "module": "storage_clients._memory._memory_storage_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 937, + "module": "storage_clients._memory._memory_storage_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nIn most 
cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n", + "args": { + "write_metadata": "Whether to write metadata to the storage.", + "persist_storage": "Whether to persist the storage.", + "storage_dir": "Path to the storage directory.", + "default_request_queue_id": "The default request queue ID.", + "default_key_value_store_id": "The default key-value store ID.", + "default_dataset_id": "The default dataset ID." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "flags": {}, + "id": 938, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to write metadata to the storage." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 939, + "kind": 32768, + "kindString": "Parameter", + "name": "write_metadata", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist the storage." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 940, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_storage", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to the storage directory." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 941, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_dir", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default request queue ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 942, + "kind": 32768, + "kindString": "Parameter", + "name": "default_request_queue_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default key-value store ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 943, + "kind": 32768, + "kindString": "Parameter", + "name": "default_key_value_store_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default dataset ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 944, + "kind": 32768, + "kindString": "Parameter", + "name": "default_dataset_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 945, + "module": "storage_clients._memory._memory_storage_client", + "name": "from_config", + "parsedDocstring": { + "text": "Initialize a new instance based on the provided `Configuration`.\n", + "args": { + "config": "The `Configuration` instance. Uses the global (default) one if not provided." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 98 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "flags": {}, + "id": 946, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_config", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Uses the global (default) one if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 947, + "kind": 32768, + "kindString": "Parameter", + "name": "config", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to write metadata to the storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 948, + "module": "storage_clients._memory._memory_storage_client", + "name": "write_metadata", + "parsedDocstring": { + "text": "Whether to write metadata to the storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist the storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 949, + "module": "storage_clients._memory._memory_storage_client", + "name": "persist_storage", + "parsedDocstring": { + "text": "Whether to persist the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to the storage directory." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 950, + "module": "storage_clients._memory._memory_storage_client", + "name": "storage_dir", + "parsedDocstring": { + "text": "Path to the storage directory." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to the directory containing datasets." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 951, + "module": "storage_clients._memory._memory_storage_client", + "name": "datasets_directory", + "parsedDocstring": { + "text": "Path to the directory containing datasets." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to the directory containing key-value stores." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 952, + "module": "storage_clients._memory._memory_storage_client", + "name": "key_value_stores_directory", + "parsedDocstring": { + "text": "Path to the directory containing key-value stores." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to the directory containing request queues." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 953, + "module": "storage_clients._memory._memory_storage_client", + "name": "request_queues_directory", + "parsedDocstring": { + "text": "Path to the directory containing request queues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific dataset by its ID." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 954, + "module": "storage_clients._base._storage_client", + "name": "dataset", + "parsedDocstring": { + "text": "Get a subclient for a specific dataset by its ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 146 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific dataset by its ID." + } + ] + }, + "flags": {}, + "id": 1191, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "dataset", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1192, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "1062" + }, + "overwrites": { + "name": "StorageClient.dataset", + "target": 1190, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.dataset", + "target": 1190, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for dataset collection operations." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 957, + "module": "storage_clients._base._storage_client", + "name": "datasets", + "parsedDocstring": { + "text": "Get a subclient for dataset collection operations." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for dataset collection operations." + } + ] + }, + "flags": {}, + "id": 1194, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "datasets", + "parameters": [], + "type": { + "name": "DatasetCollectionClient", + "type": "reference", + "target": "1046" + }, + "overwrites": { + "name": "StorageClient.datasets", + "target": 1193, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.datasets", + "target": 1193, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific key-value store by its ID." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 959, + "module": "storage_clients._base._storage_client", + "name": "key_value_store", + "parsedDocstring": { + "text": "Get a subclient for a specific key-value store by its ID." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific key-value store by its ID." + } + ] + }, + "flags": {}, + "id": 1196, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "key_value_store", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1197, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "995" + }, + "overwrites": { + "name": "StorageClient.key_value_store", + "target": 1195, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.key_value_store", + "target": 1195, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for key-value store collection operations." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 962, + "module": "storage_clients._base._storage_client", + "name": "key_value_stores", + "parsedDocstring": { + "text": "Get a subclient for key-value store collection operations." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for key-value store collection operations." 
+ } + ] + }, + "flags": {}, + "id": 1199, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "key_value_stores", + "parameters": [], + "type": { + "name": "KeyValueStoreCollectionClient", + "type": "reference", + "target": "979" + }, + "overwrites": { + "name": "StorageClient.key_value_stores", + "target": 1198, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.key_value_stores", + "target": 1198, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific request queue by its ID." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 964, + "module": "storage_clients._base._storage_client", + "name": "request_queue", + "parsedDocstring": { + "text": "Get a subclient for a specific request queue by its ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 162 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific request queue by its ID." 
+ } + ] + }, + "flags": {}, + "id": 1201, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "request_queue", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1202, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "873" + }, + "overwrites": { + "name": "StorageClient.request_queue", + "target": 1200, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.request_queue", + "target": 1200, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for request queue collection operations." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 967, + "module": "storage_clients._base._storage_client", + "name": "request_queues", + "parsedDocstring": { + "text": "Get a subclient for request queue collection operations." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for request queue collection operations." 
+ } + ] + }, + "flags": {}, + "id": 1204, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "request_queues", + "parameters": [], + "type": { + "name": "RequestQueueCollectionClient", + "type": "reference", + "target": "857" + }, + "overwrites": { + "name": "StorageClient.request_queues", + "target": 1203, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.request_queues", + "target": 1203, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 969, + "module": "storage_clients._base._storage_client", + "name": "purge_on_start", + "parsedDocstring": { + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." + } + ] + }, + "flags": {}, + "id": 1206, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge_on_start", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "StorageClient.purge_on_start", + "target": 1205, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.purge_on_start", + "target": 1205, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Try to return a resource client from the internal cache." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 971, + "module": "storage_clients._memory._memory_storage_client", + "name": "get_cached_resource_client", + "parsedDocstring": { + "text": "Try to return a resource client from the internal cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Try to return a resource client from the internal cache." 
+ } + ] + }, + "flags": {}, + "id": 972, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cached_resource_client", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 973, + "kind": 32768, + "kindString": "Parameter", + "name": "resource_client_class", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 974, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 975, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "TResourceClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a new resource client to the internal cache." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 976, + "module": "storage_clients._memory._memory_storage_client", + "name": "add_resource_client_to_cache", + "parsedDocstring": { + "text": "Add a new resource client to the internal cache." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a new resource client to the internal cache." + } + ] + }, + "flags": {}, + "id": 977, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add_resource_client_to_cache", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 978, + "kind": 32768, + "kindString": "Parameter", + "name": "resource_client", + "type": { + "name": "ResourceClient", + "type": "reference", + "target": "1187" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3539, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." 
+ } + ] + }, + "flags": {}, + "id": 1208, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict[int, int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 1207, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 1207, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents an in-memory storage client for managing datasets, key-value stores, and request queues.\n\nIt emulates in-memory storage similar to the Apify platform, supporting both in-memory and local file system-based\npersistence.\n\nThe behavior of the storage, such as data persistence and metadata writing, can be customized via initialization\nparameters or environment variables." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 937, + 976, + 954, + 957, + 945, + 971, + 3539, + 959, + 962, + 969, + 964, + 967 + ], + "title": "Methods" + }, + { + "children": [ + 951, + 952, + 949, + 953, + 950, + 948 + ], + "title": "Properties" + } + ], + "id": 936, + "module": "storage_clients._memory._memory_storage_client", + "name": "MemoryStorageClient", + "parsedDocstring": { + "text": "Represents an in-memory storage client for managing datasets, key-value stores, and request queues.\n\nIt emulates in-memory storage similar to the Apify platform, supporting both in-memory and local file system-based\npersistence.\n\nThe behavior of the storage, such as data persistence and metadata writing, can be customized via initialization\nparameters or environment variables." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageClient", + "target": "1189", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 980, + "module": "storage_clients._memory._key_value_store_collection_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 981, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 982, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 983, + "module": "storage_clients._memory._key_value_store_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/storage_clients/_memory/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 984, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 985, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 986, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 987, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "760" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 988, + "module": "storage_clients._memory._key_value_store_collection_client", + "name": "list", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/storage_clients/_memory/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 989, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 990, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 991, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 992, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 993, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreListPage", + "type": "reference", + "target": "819" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient for manipulating key-value stores." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 980, + 983, + 988 + ], + "title": "Methods" + } + ], + "id": 979, + "module": "storage_clients._memory._key_value_store_collection_client", + "name": "KeyValueStoreCollectionClient", + "parsedDocstring": { + "text": "Subclient for manipulating key-value stores." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 994, + "module": "storage_clients._memory._key_value_store_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 996, + "module": "storage_clients._memory._key_value_store_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 997, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + 
"flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 998, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 999, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1000, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1001, + "kind": 32768, + "kindString": "Parameter", + "name": "created_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1002, + "kind": 32768, + "kindString": "Parameter", + "name": "accessed_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1003, + "kind": 32768, + "kindString": "Parameter", + "name": "modified_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + 
"value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource info for the key-value store client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1004, + "module": "storage_clients._memory._key_value_store_client", + "name": "resource_info", + "parsedDocstring": { + "text": "Get the resource info for the key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "760" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource directory for the client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1005, + "module": "storage_clients._memory._key_value_store_client", + "name": "resource_directory", + "parsedDocstring": { + "text": "Get the resource directory for the client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1006, + "module": "storage_clients._memory._key_value_store_client", + "name": "get", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1007, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "760" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1008, + "module": "storage_clients._memory._key_value_store_client", + "name": "update", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1009, + "kind": 4096, + "kindString": 
"Call signature", + "modifiers": [ + "async" + ], + "name": "update", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1010, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "760" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1011, + "module": "storage_clients._memory._key_value_store_client", + "name": "delete", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1012, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1013, + "module": "storage_clients._memory._key_value_store_client", + "name": "list_keys", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { 
+ "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1014, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_keys", + "parameters": [ + { + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1015, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1016, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreListKeysPage", + "type": "reference", + "target": "786" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1017, + "module": "storage_clients._memory._key_value_store_client", + "name": "get_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1018, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1019, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + 
"type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1020, + "module": "storage_clients._memory._key_value_store_client", + "name": "get_record_as_bytes", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 217 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1021, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_record_as_bytes", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1022, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord[bytes] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + } + ], + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1023, + "module": "storage_clients._memory._key_value_store_client", + "name": "stream_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1024, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stream_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1025, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "typeArguments": [ + { + "type": "reference", + "name": "Response" + } + ], + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1026, + "module": "storage_clients._memory._key_value_store_client", + "name": "set_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 225 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1027, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1028, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + 
"isOptional": false, + "keyword-only": false + }, + "id": 1029, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1030, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1031, + "module": "storage_clients._memory._key_value_store_client", + "name": "delete_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 269 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1032, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1033, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1034, + "module": 
"storage_clients._memory._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 291 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1035, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1036, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the specified record to the key-value store." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1037, + "module": "storage_clients._memory._key_value_store_client", + "name": "persist_record", + "parsedDocstring": { + "text": "Persist the specified record to the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 312 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the specified record to the key-value store." 
+ } + ] + }, + "flags": {}, + "id": 1038, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1039, + "kind": 32768, + "kindString": "Parameter", + "name": "record", + "type": { + "name": "KeyValueStoreRecord", + "type": "reference", + "target": "772" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the specified record from the key-value store." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1040, + "module": "storage_clients._memory._key_value_store_client", + "name": "delete_persisted_record", + "parsedDocstring": { + "text": "Delete the specified record from the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 345 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the specified record from the key-value store." 
+ } + ] + }, + "flags": {}, + "id": 1041, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_persisted_record", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1042, + "kind": 32768, + "kindString": "Parameter", + "name": "record", + "type": { + "name": "KeyValueStoreRecord", + "type": "reference", + "target": "772" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the key-value store." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1043, + "module": "storage_clients._memory._key_value_store_client", + "name": "update_timestamps", + "parsedDocstring": { + "text": "Update the timestamps of the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 360 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the key-value store." + } + ] + }, + "flags": {}, + "id": 1044, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update_timestamps", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1045, + "kind": 32768, + "kindString": "Parameter", + "name": "has_been_modified", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient for manipulating a single key-value store." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 996, + 1011, + 1040, + 1031, + 1006, + 1034, + 1017, + 1020, + 1013, + 1037, + 1026, + 1023, + 1008, + 1043 + ], + "title": "Methods" + }, + { + "children": [ + 1005, + 1004 + ], + "title": "Properties" + } + ], + "id": 995, + "module": "storage_clients._memory._key_value_store_client", + "name": "KeyValueStoreClient", + "parsedDocstring": { + "text": "Subclient for manipulating a single key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1047, + "module": "storage_clients._memory._dataset_collection_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1048, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1049, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ 
+ { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1050, + "module": "storage_clients._memory._dataset_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1051, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1052, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1053, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1054, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "757" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + 
}, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1055, + "module": "storage_clients._memory._dataset_collection_client", + "name": "list", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1056, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1057, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1058, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1059, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1060, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetListPage", + "type": "reference", + "target": "817" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient 
for manipulating datasets." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1047, + 1050, + 1055 + ], + "title": "Methods" + } + ], + "id": 1046, + "module": "storage_clients._memory._dataset_collection_client", + "name": "DatasetCollectionClient", + "parsedDocstring": { + "text": "Subclient for manipulating datasets." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1061, + "module": "storage_clients._memory._dataset_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1063, + "module": "storage_clients._memory._dataset_client", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1064, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": 
false, + "keyword-only": true + }, + "id": 1065, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1066, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1067, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1068, + "kind": 32768, + "kindString": "Parameter", + "name": "created_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1069, + "kind": 32768, + "kindString": "Parameter", + "name": "accessed_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1070, + "kind": 32768, + "kindString": "Parameter", + "name": "modified_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + 
{ + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1071, + "kind": 32768, + "kindString": "Parameter", + "name": "item_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource info for the dataset client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1072, + "module": "storage_clients._memory._dataset_client", + "name": "resource_info", + "parsedDocstring": { + "text": "Get the resource info for the dataset client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "757" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the resource directory for the client." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1073, + "module": "storage_clients._memory._dataset_client", + "name": "resource_directory", + "parsedDocstring": { + "text": "Get the resource directory for the client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1074, + "module": "storage_clients._memory._dataset_client", + "name": "get", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1075, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "DatasetMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "757" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1076, + "module": "storage_clients._memory._dataset_client", + "name": "update", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1077, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + 
"name": "update", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1078, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "757" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1079, + "module": "storage_clients._memory._dataset_client", + "name": "delete", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1080, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1081, + "module": "storage_clients._memory._dataset_client", + "name": "list_items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + 
"id": 1082, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1083, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "_LIST_ITEMS_LIMIT", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1084, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1085, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1086, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1087, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1088, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + 
{ + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1089, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1090, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1091, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1092, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1093, + "kind": 32768, + "kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": 
{ + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1094, + "module": "storage_clients._memory._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1095, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1096, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1097, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1098, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1099, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1100, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + 
"types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1101, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1102, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1103, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1104, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1105, + "module": "storage_clients._memory._dataset_client", + "name": "get_items_as_bytes", + "parsedDocstring": { + "text": "" 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1106, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_items_as_bytes", + "parameters": [ + { + "defaultValue": "'json'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1107, + "kind": 32768, + "kindString": "Parameter", + "name": "item_format", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1108, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1109, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1110, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1111, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1112, + "kind": 32768, + "kindString": "Parameter", + 
"name": "bom", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1113, + "kind": 32768, + "kindString": "Parameter", + "name": "delimiter", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1114, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1115, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1116, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1117, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + 
"id": 1118, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_header_row", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1119, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1120, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_root", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1121, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_row", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1122, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1123, + "module": "storage_clients._memory._dataset_client", + "name": "stream_items", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 270 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1124, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stream_items", + "parameters": [ + { + "defaultValue": "'json'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1125, + "kind": 32768, + "kindString": "Parameter", + "name": "item_format", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1126, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1127, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1128, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1129, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1130, + "kind": 32768, + "kindString": "Parameter", + "name": "bom", + 
"type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1131, + "kind": 32768, + "kindString": "Parameter", + "name": "delimiter", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1132, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1133, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1134, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1135, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1136, + 
"kind": 32768, + "kindString": "Parameter", + "name": "skip_header_row", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1137, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1138, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_root", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1139, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_row", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Response" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1140, + "module": "storage_clients._memory._dataset_client", + "name": "push_items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 292 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1141, + 
"kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_items", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1142, + "kind": 32768, + "kindString": "Parameter", + "name": "items", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the dataset." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1143, + "module": "storage_clients._memory._dataset_client", + "name": "update_timestamps", + "parsedDocstring": { + "text": "Update the timestamps of the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 364 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the timestamps of the dataset." + } + ] + }, + "flags": {}, + "id": 1144, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update_timestamps", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1145, + "kind": 32768, + "kindString": "Parameter", + "name": "has_been_modified", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the start and end indexes for listing items." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1146, + "module": "storage_clients._memory._dataset_client", + "name": "get_start_and_end_indexes", + "parsedDocstring": { + "text": "Calculate the start and end indexes for listing items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 379 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the start and end indexes for listing items." + } + ] + }, + "flags": {}, + "id": 1147, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_start_and_end_indexes", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1148, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1149, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "tuple", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subclient for manipulating a single dataset." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1063, + 1079, + 1074, + 1105, + 1146, + 1094, + 1081, + 1140, + 1123, + 1076, + 1143 + ], + "title": "Methods" + }, + { + "children": [ + 1073, + 1072 + ], + "title": "Properties" + } + ], + "id": 1062, + "module": "storage_clients._memory._dataset_client", + "name": "DatasetClient", + "parsedDocstring": { + "text": "Subclient for manipulating a single dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1150, + "module": "storage_clients._memory._creation_management", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update or writes metadata to a specified directory.\n\nThe function writes a given metadata dictionary to a JSON file within a specified directory.\nThe writing process is skipped if `write_metadata` is False. 
Before writing, it ensures that\nthe target directory exists, creating it if necessary.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1151, + "module": "storage_clients._memory._creation_management", + "name": "persist_metadata_if_enabled", + "parsedDocstring": { + "text": "Update or writes metadata to a specified directory.\n\nThe function writes a given metadata dictionary to a JSON file within a specified directory.\nThe writing process is skipped if `write_metadata` is False. Before writing, it ensures that\nthe target directory exists, creating it if necessary.\n", + "args": { + "data": "A dictionary containing metadata to be written.", + "entity_directory": "The directory path where the metadata file should be stored.", + "write_metadata": "A boolean flag indicating whether the metadata should be written to file." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update or writes metadata to a specified directory.\n\nThe function writes a given metadata dictionary to a JSON file within a specified directory.\nThe writing process is skipped if `write_metadata` is False. Before writing, it ensures that\nthe target directory exists, creating it if necessary.\n" + } + ] + }, + "flags": {}, + "id": 1152, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_metadata_if_enabled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A dictionary containing metadata to be written." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1153, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The directory path where the metadata file should be stored." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1154, + "kind": 32768, + "kindString": "Parameter", + "name": "entity_directory", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A boolean flag indicating whether the metadata should be written to file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1155, + "kind": 32768, + "kindString": "Parameter", + "name": "write_metadata", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate or create a new storage client based on the given ID or name.\n\nThis method attempts to find a storage client in the memory cache first. If not found,\nit tries to locate a storage directory by name. If still not found, it searches through\nstorage directories for a matching ID or name in their metadata. If none exists, and the\nspecified ID is 'default', it checks for a default storage directory. If a storage client\nis found or created, it is added to the memory cache. 
If no storage client can be located or\ncreated, the method returns None.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1156, + "module": "storage_clients._memory._creation_management", + "name": "find_or_create_client_by_id_or_name_inner", + "parsedDocstring": { + "text": "Locate or create a new storage client based on the given ID or name.\n\nThis method attempts to find a storage client in the memory cache first. If not found,\nit tries to locate a storage directory by name. If still not found, it searches through\nstorage directories for a matching ID or name in their metadata. If none exists, and the\nspecified ID is 'default', it checks for a default storage directory. If a storage client\nis found or created, it is added to the memory cache. If no storage client can be located or\ncreated, the method returns None.\n", + "args": { + "resource_client_class": "The class of the resource client.", + "memory_storage_client": "The memory storage client used to store and retrieve storage clients.", + "id": "The unique identifier for the storage client.", + "name": "The name of the storage client.\n" + }, + "returns": "The found or created storage client, or None if no client could be found or created." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The found or created storage client, or None if no client could be found or created." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Locate or create a new storage client based on the given ID or name.\n\nThis method attempts to find a storage client in the memory cache first. If not found,\nit tries to locate a storage directory by name. 
If still not found, it searches through\nstorage directories for a matching ID or name in their metadata. If none exists, and the\nspecified ID is 'default', it checks for a default storage directory. If a storage client\nis found or created, it is added to the memory cache. If no storage client can be located or\ncreated, the method returns None.\n" + } + ] + }, + "flags": {}, + "id": 1157, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_or_create_client_by_id_or_name_inner", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The class of the resource client." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1158, + "kind": 32768, + "kindString": "Parameter", + "name": "resource_client_class", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The memory storage client used to store and retrieve storage clients." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1159, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier for the storage client." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1160, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage client.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1161, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "TResourceClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a named storage, or create a new one when it doesn't exist.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1162, + "module": "storage_clients._memory._creation_management", + "name": "get_or_create_inner", + "parsedDocstring": { + "text": "Retrieve a named storage, or create a new one when it doesn't exist.\n", + "args": { + "memory_storage_client": "The memory storage client.", + "storage_client_cache": "The cache of storage clients.", + "resource_client_class": "The class of the storage to retrieve or create.", + "name": "The name of the storage to retrieve or create.", + "id": "ID of the storage to retrieve or create.\n" + }, + "returns": "The retrieved or newly-created storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved or newly-created storage." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a named storage, or create a new one when it doesn't exist.\n" + } + ] + }, + "flags": {}, + "id": 1163, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create_inner", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The memory storage client." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1164, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The cache of storage clients." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1165, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client_cache", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + } + ], + "target": "866" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The class of the storage to retrieve or create." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1166, + "kind": 32768, + "kindString": "Parameter", + "name": "resource_client_class", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TResourceClient", + "target": "934" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage to retrieve or create." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1167, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the storage to retrieve or create.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1168, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "TResourceClient", + "type": "reference", + "target": "934" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1169, + "module": "storage_clients._memory._creation_management", + "name": "create_dataset_from_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ 
+ { + "flags": {}, + "id": 1170, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_dataset_from_directory", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1171, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_directory", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1172, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1173, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1174, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "1062" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1175, + "module": "storage_clients._memory._creation_management", + "name": "create_kvs_from_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 234 + } + ], + "type": { + "name": 
"Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_kvs_from_directory", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1177, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_directory", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1178, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1179, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1180, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "995" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1181, + "module": "storage_clients._memory._creation_management", + "name": "create_rq_from_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_memory/_creation_management.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 329 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1182, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_rq_from_directory", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1183, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_directory", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1184, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_storage_client", + "type": { + "name": "MemoryStorageClient", + "type": "reference", + "target": "936" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1185, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1186, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "873" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1187, + "module": "storage_clients._base._types", + "name": "ResourceClient", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_types.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1188, + "module": "storage_clients._base._types", + "name": "ResourceCollectionClient", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific dataset by its ID." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1190, + "module": "storage_clients._base._storage_client", + "name": "dataset", + "parsedDocstring": { + "text": "Get a subclient for a specific dataset by its ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific dataset by its ID." 
+ } + ] + }, + "flags": {}, + "id": 1191, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "dataset", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1192, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "1062" + }, + "overwrites": { + "name": "StorageClient.dataset", + "target": 1190, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for dataset collection operations." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1193, + "module": "storage_clients._base._storage_client", + "name": "datasets", + "parsedDocstring": { + "text": "Get a subclient for dataset collection operations." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for dataset collection operations." + } + ] + }, + "flags": {}, + "id": 1194, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "datasets", + "parameters": [], + "type": { + "name": "DatasetCollectionClient", + "type": "reference", + "target": "1046" + }, + "overwrites": { + "name": "StorageClient.datasets", + "target": 1193, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific key-value store by its ID." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1195, + "module": "storage_clients._base._storage_client", + "name": "key_value_store", + "parsedDocstring": { + "text": "Get a subclient for a specific key-value store by its ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific key-value store by its ID." + } + ] + }, + "flags": {}, + "id": 1196, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "key_value_store", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1197, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "995" + }, + "overwrites": { + "name": "StorageClient.key_value_store", + "target": 1195, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for key-value store collection operations." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1198, + "module": "storage_clients._base._storage_client", + "name": "key_value_stores", + "parsedDocstring": { + "text": "Get a subclient for key-value store collection operations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for key-value store collection operations." + } + ] + }, + "flags": {}, + "id": 1199, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "key_value_stores", + "parameters": [], + "type": { + "name": "KeyValueStoreCollectionClient", + "type": "reference", + "target": "979" + }, + "overwrites": { + "name": "StorageClient.key_value_stores", + "target": 1198, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific request queue by its ID." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1200, + "module": "storage_clients._base._storage_client", + "name": "request_queue", + "parsedDocstring": { + "text": "Get a subclient for a specific request queue by its ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for a specific request queue by its ID." 
+ } + ] + }, + "flags": {}, + "id": 1201, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "request_queue", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1202, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "873" + }, + "overwrites": { + "name": "StorageClient.request_queue", + "target": 1200, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for request queue collection operations." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1203, + "module": "storage_clients._base._storage_client", + "name": "request_queues", + "parsedDocstring": { + "text": "Get a subclient for request queue collection operations." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a subclient for request queue collection operations." 
+ } + ] + }, + "flags": {}, + "id": 1204, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "request_queues", + "parameters": [], + "type": { + "name": "RequestQueueCollectionClient", + "type": "reference", + "target": "857" + }, + "overwrites": { + "name": "StorageClient.request_queues", + "target": 1203, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1205, + "module": "storage_clients._base._storage_client", + "name": "purge_on_start", + "parsedDocstring": { + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform a purge of the default storages.\n\nThis method ensures that the purge is executed only once during the lifetime of the instance.\nIt is primarily used to clean up residual data from previous runs to maintain a clean state.\nIf the storage client does not support purging, leave it empty." 
+ } + ] + }, + "flags": {}, + "id": 1206, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge_on_start", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "StorageClient.purge_on_start", + "target": 1205, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1207, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "flags": {}, + "id": 1208, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines an abstract base for storage clients.\n\nIt offers interfaces to get subclients for interacting with storage resources like datasets, key-value stores,\nand request queues." 
+ } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1190, + 1193, + 1207, + 1195, + 1198, + 1205, + 1200, + 1203 + ], + "title": "Methods" + } + ], + "id": 1189, + "module": "storage_clients._base._storage_client", + "name": "StorageClient", + "parsedDocstring": { + "text": "Defines an abstract base for storage clients.\n\nIt offers interfaces to get subclients for interacting with storage resources like datasets, key-value stores,\nand request queues." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "MemoryStorageClient", + "target": "936", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing request queue by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1210, + "module": "storage_clients._base._request_queue_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "Retrieve an existing request queue by its name or ID, or create a new one if it does not exist.\n", + "args": { + "id": "Optional ID of the request queue to retrieve or create. If provided, the method will attempt\nto find a request queue with the ID.", + "name": "Optional name of the request queue resource to retrieve or create. 
If provided, the method will\nattempt to find a request queue with this name.", + "schema": "Optional schema for the request queue resource to be created.\n" + }, + "returns": "Metadata object containing the information of the retrieved or created request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Metadata object containing the information of the retrieved or created request queue." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing request queue by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "flags": {}, + "id": 1211, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional ID of the request queue to retrieve or create. If provided, the method will attempt\nto find a request queue with the ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1212, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional name of the request queue resource to retrieve or create. If provided, the method will\nattempt to find a request queue with this name." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1213, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional schema for the request queue resource to be created.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1214, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "763" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List the available request queues.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1215, + "module": "storage_clients._base._request_queue_collection_client", + "name": "list", + "parsedDocstring": { + "text": "List the available request queues.\n", + "args": { + "unnamed": "Whether to list only the unnamed request queues.", + "limit": "Maximum number of request queues to return.", + "offset": "Number of request queues to skip from the beginning of the list.", + "desc": "Whether to sort the request queues in descending order.\n" + }, + "returns": "The list of available request queues matching the specified filters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The list of available request queues matching the specified filters." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "List the available request queues.\n" + } + ] + }, + "flags": {}, + "id": 1216, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to list only the unnamed request queues." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1217, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of request queues to return." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1218, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of request queues to skip from the beginning of the list." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1219, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to sort the request queues in descending order.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1220, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "RequestQueueListPage", + "type": "reference", + "target": "821" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for request queue collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1210, + 1215 + ], + "title": "Methods" + } + ], + "id": 1209, + "module": "storage_clients._base._request_queue_collection_client", + "name": "RequestQueueCollectionClient", + "parsedDocstring": { + "text": "An abstract class for request queue collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get metadata about the request queue being managed by this client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1222, + "module": "storage_clients._base._request_queue_client", + "name": "get", + "parsedDocstring": { + "text": "Get metadata about the request queue being managed by this client.\n", + "returns": "An object containing the request queue's details, or None if the request queue does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object containing the request queue's details, or None if the request queue does not exist." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get metadata about the request queue being managed by this client.\n" + } + ] + }, + "flags": {}, + "id": 1223, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "RequestQueueMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "763" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the request queue metadata.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1224, + "module": "storage_clients._base._request_queue_client", + "name": "update", + "parsedDocstring": { + "text": "Update the request queue metadata.\n", + "args": { + "name": "New new name for the request queue.\n" + }, + "returns": "An object reflecting the updated request queue metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object reflecting the updated request queue metadata." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Update the request queue metadata.\n" + } + ] + }, + "flags": {}, + "id": 1225, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "New new name for the request queue.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1226, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "763" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the request queue managed by this client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1227, + "module": "storage_clients._base._request_queue_client", + "name": "delete", + "parsedDocstring": { + "text": "Permanently delete the request queue managed by this client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the request queue managed by this client." 
+ } + ] + }, + "flags": {}, + "id": 1228, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a given number of requests from the beginning of the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1229, + "module": "storage_clients._base._request_queue_client", + "name": "list_head", + "parsedDocstring": { + "text": "Retrieve a given number of requests from the beginning of the queue.\n", + "args": { + "limit": "How many requests to retrieve.\n" + }, + "returns": "The desired number of requests from the beginning of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The desired number of requests from the beginning of the queue." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a given number of requests from the beginning of the queue.\n" + } + ] + }, + "flags": {}, + "id": 1230, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_head", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How many requests to retrieve.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1231, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueHead", + "type": "reference", + "target": "801" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fetch and lock a specified number of requests from the start of the queue.\n\nRetrieve and locks the first few requests of a queue for the specified duration. This prevents the requests\nfrom being fetched by another client until the lock expires.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1232, + "module": "storage_clients._base._request_queue_client", + "name": "list_and_lock_head", + "parsedDocstring": { + "text": "Fetch and lock a specified number of requests from the start of the queue.\n\nRetrieve and locks the first few requests of a queue for the specified duration. This prevents the requests\nfrom being fetched by another client until the lock expires.\n", + "args": { + "lock_secs": "Duration for which the requests are locked, in seconds.", + "limit": "Maximum number of requests to retrieve and lock.\n" + }, + "returns": "The desired number of locked requests from the beginning of the queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The desired number of locked requests from the beginning of the queue." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Fetch and lock a specified number of requests from the start of the queue.\n\nRetrieve and locks the first few requests of a queue for the specified duration. This prevents the requests\nfrom being fetched by another client until the lock expires.\n" + } + ] + }, + "flags": {}, + "id": 1233, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_and_lock_head", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Duration for which the requests are locked, in seconds." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1234, + "kind": 32768, + "kindString": "Parameter", + "name": "lock_secs", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to retrieve and lock.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1235, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueHeadWithLocks", + "type": "reference", + "target": "807" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a request to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1236, + "module": "storage_clients._base._request_queue_client", + "name": "add_request", + "parsedDocstring": { + "text": "Add a request to the queue.\n", + "args": { + "request": "The request to add to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Request queue operation information." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Request queue operation information." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a request to the queue.\n" + } + ] + }, + "flags": {}, + "id": 1237, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1238, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1239, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a batch of requests to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1240, + "module": "storage_clients._base._request_queue_client", + "name": "batch_add_requests", + "parsedDocstring": { + "text": "Add a batch of requests to the queue.\n", + "args": { + "requests": "The requests to add to the queue.", + "forefront": "Whether to add the requests to the head or the end of the queue.\n" + }, + "returns": "Request queue batch operation information." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Request queue batch operation information." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a batch of requests to the queue.\n" + } + ] + }, + "flags": {}, + "id": 1241, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "batch_add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1242, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the requests to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1243, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "BatchRequestsOperationResponse", + "type": "reference", + "target": "839" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1244, + "module": "storage_clients._base._request_queue_client", + "name": 
"get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "request_id": "ID of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 1245, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1246, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update a request in the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1247, + "module": "storage_clients._base._request_queue_client", + "name": "update_request", + "parsedDocstring": { + "text": "Update a request in the queue.\n", + "args": { + "request": "The updated request.", + "forefront": "Whether to put the updated 
request in the beginning or the end of the queue.\n" + }, + "returns": "The updated request" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The updated request" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Update a request in the queue.\n" + } + ] + }, + "flags": {}, + "id": 1248, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The updated request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the updated request in the beginning or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1250, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1251, + "module": "storage_clients._base._request_queue_client", + "name": "delete_request", + "parsedDocstring": { + "text": 
"Delete a request from the queue.\n", + "args": { + "request_id": "ID of the request to delete." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 146 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 1252, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the request to delete." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1253, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Prolong the lock on a specific request in the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1254, + "module": "storage_clients._base._request_queue_client", + "name": "prolong_request_lock", + "parsedDocstring": { + "text": "Prolong the lock on a specific request in the queue.\n", + "args": { + "request_id": "The identifier of the request whose lock is to be prolonged.", + "forefront": "Whether to put the request in the beginning or the end of the queue after lock expires.", + "lock_secs": "The additional amount of time, in seconds, that the request will remain locked." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Prolong the lock on a specific request in the queue.\n" + } + ] + }, + "flags": {}, + "id": 1255, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "prolong_request_lock", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The identifier of the request whose lock is to be prolonged." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1256, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the request in the beginning or the end of the queue after lock expires." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1257, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The additional amount of time, in seconds, that the request will remain locked." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1258, + "kind": 32768, + "kindString": "Parameter", + "name": "lock_secs", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "ProlongRequestLockResponse", + "type": "reference", + "target": "825" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the lock on a specific request in the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1259, + "module": "storage_clients._base._request_queue_client", + "name": "delete_request_lock", + "parsedDocstring": { + "text": "Delete the lock on a specific request in the queue.\n", + "args": { + "request_id": "ID of the request to delete the lock.", + "forefront": "Whether to put the request in the beginning or the end of the queue after the lock is deleted." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the lock on a specific request in the queue.\n" + } + ] + }, + "flags": {}, + "id": 1260, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_request_lock", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the request to delete the lock." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1261, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the request in the beginning or the end of the queue after the lock is deleted." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1262, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete given requests from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1263, + "module": "storage_clients._base._request_queue_client", + "name": "batch_delete_requests", + "parsedDocstring": { + "text": "Delete given requests from the queue.\n", + "args": { + "requests": "The requests to delete from the queue." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 184 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete given requests from the queue.\n" + } + ] + }, + "flags": {}, + "id": 1264, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "batch_delete_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to delete from the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1265, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "398" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "BatchRequestsOperationResponse", + "type": "reference", + "target": "839" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for request queue resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1236, + 1240, + 1263, + 1227, + 1251, + 1259, + 1222, + 1244, + 1232, + 1229, + 1254, + 1224, + 1247 + ], + "title": "Methods" + } + ], + "id": 1221, + "module": "storage_clients._base._request_queue_client", + "name": "RequestQueueClient", + "parsedDocstring": { + "text": "An abstract class for request queue resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing key-value store by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1267, + "module": "storage_clients._base._key_value_store_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "Retrieve an existing key-value store by its name or ID, or create a new one if it does not exist.\n", + "args": { + "id": "Optional ID of the key-value store to retrieve or create. If provided, the method will attempt\nto find a key-value store with the ID.", + "name": "Optional name of the key-value store resource to retrieve or create. If provided, the method will\nattempt to find a key-value store with this name.", + "schema": "Optional schema for the key-value store resource to be created.\n" + }, + "returns": "Metadata object containing the information of the retrieved or created key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Metadata object containing the information of the retrieved or created key-value store." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing key-value store by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "flags": {}, + "id": 1268, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional ID of the key-value store to retrieve or create. If provided, the method will attempt\nto find a key-value store with the ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1269, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional name of the key-value store resource to retrieve or create. If provided, the method will\nattempt to find a key-value store with this name." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1270, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional schema for the key-value store resource to be created.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1271, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "760" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List the available key-value stores.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1272, + "module": "storage_clients._base._key_value_store_collection_client", + "name": "list", + "parsedDocstring": { + "text": "List the available key-value stores.\n", + "args": { + "unnamed": "Whether to list only the unnamed key-value stores.", + "limit": "Maximum number of key-value stores to return.", + "offset": "Number of key-value stores to skip from the beginning of the list.", + "desc": "Whether to sort the key-value stores in descending order.\n" + }, + "returns": "The list of available key-value stores matching the specified filters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The list of available key-value stores matching the specified filters." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "List the available key-value stores.\n" + } + ] + }, + "flags": {}, + "id": 1273, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to list only the unnamed key-value stores." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1274, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of key-value stores to return." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1275, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of key-value stores to skip from the beginning of the list." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1276, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to sort the key-value stores in descending order.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1277, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreListPage", + "type": "reference", + "target": "819" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for key-value store collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1267, + 1272 + ], + "title": "Methods" + } + ], + "id": 1266, + "module": "storage_clients._base._key_value_store_collection_client", + "name": "KeyValueStoreCollectionClient", + "parsedDocstring": { + "text": "An abstract class for key-value store collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get metadata about the key-value store being managed by this client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1279, + "module": "storage_clients._base._key_value_store_client", + "name": "get", + "parsedDocstring": { + "text": "Get metadata about the key-value store being managed by this client.\n", + "returns": "An object containing the key-value store's details, or None if the key-value store does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object containing the key-value store's details, or None if the key-value store does not exist." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get metadata about the key-value store being managed by this client.\n" + } + ] + }, + "flags": {}, + "id": 1280, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "760" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the key-value store metadata.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1281, + "module": "storage_clients._base._key_value_store_client", + "name": "update", + "parsedDocstring": { + "text": "Update the key-value store metadata.\n", + "args": { + "name": "New new name for the key-value store.\n" + }, + "returns": "An object reflecting the updated key-value store metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object reflecting the updated key-value store metadata." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Update the key-value store metadata.\n" + } + ] + }, + "flags": {}, + "id": 1282, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "New new name for the key-value store.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1283, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "760" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the key-value store managed by this client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1284, + "module": "storage_clients._base._key_value_store_client", + "name": "delete", + "parsedDocstring": { + "text": "Permanently delete the key-value store managed by this client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the key-value store managed by this client." 
+ } + ] + }, + "flags": {}, + "id": 1285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List the keys in the key-value store.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1286, + "module": "storage_clients._base._key_value_store_client", + "name": "list_keys", + "parsedDocstring": { + "text": "List the keys in the key-value store.\n", + "args": { + "limit": "Number of keys to be returned. Maximum value is 1000.", + "exclusive_start_key": "All keys up to this one (including) are skipped from the result.\n" + }, + "returns": "The list of keys in the key-value store matching the given arguments." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The list of keys in the key-value store matching the given arguments." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "List the keys in the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 1287, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_keys", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of keys to be returned. Maximum value is 1000." 
+ } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1288, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "All keys up to this one (including) are skipped from the result.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1289, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreListKeysPage", + "type": "reference", + "target": "786" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1290, + "module": "storage_clients._base._key_value_store_client", + "name": "get_record", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n", + "args": { + "key": "Key of the record to retrieve.\n" + }, + "returns": "The requested record, or None, if the record does not exist" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The requested record, or None, if the record does not exist" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve the 
given record from the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 1291, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_record", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1292, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store, without parsing it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1293, + "module": "storage_clients._base._key_value_store_client", + "name": "get_record_as_bytes", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store, without parsing it.\n", + "args": { + "key": "Key of the record to retrieve.\n" + }, + "returns": "The requested record, or None, if the record does not exist" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The requested record, or None, if the record does not exist" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store, without 
parsing it.\n" + } + ] + }, + "flags": {}, + "id": 1294, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_record_as_bytes", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1295, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord[bytes] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + } + ], + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store, as a stream.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1296, + "module": "storage_clients._base._key_value_store_client", + "name": "stream_record", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store, as a stream.\n", + "args": { + "key": "Key of the record to retrieve.\n" + }, + "returns": "The requested record as a context-managed streaming Response, or None, if the record does not exist" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The requested record as a context-managed streaming Response, or None, if the record does not exist" + } + 
], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store, as a stream.\n" + } + ] + }, + "flags": {}, + "id": 1297, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stream_record", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1298, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "typeArguments": [ + { + "type": "reference", + "name": "Response" + } + ], + "target": "772" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value to the given record in the key-value store.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1299, + "module": "storage_clients._base._key_value_store_client", + "name": "set_record", + "parsedDocstring": { + "text": "Set a value to the given record in the key-value store.\n", + "args": { + "key": "The key of the record to save the value to.", + "value": "The value to save into the record.", + "content_type": "The content type of the saved value." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value to the given record in the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 1300, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_record", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key of the record to save the value to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1301, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The value to save into the record." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1302, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The content type of the saved value." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1303, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the specified record from the key-value store.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1304, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_record", + "parsedDocstring": { + "text": "Delete the specified record from the key-value store.\n", + "args": { + "key": "The key of the record which to delete." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete the specified record from the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 1305, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_record", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key of the record which to delete." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1306, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1307, + "module": "storage_clients._base._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n", + "args": { + "key": "Key of the record for which URL is required.\n" + }, + "returns": "The public URL for the given key." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 120 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The public URL for the given key." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "flags": {}, + "id": 1308, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record for which URL is required.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1309, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for key-value store resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1284, + 1304, + 1279, + 1307, + 1290, + 1293, + 1286, + 1299, + 1296, + 1281 + ], + "title": "Methods" + } + ], + "id": 1278, + "module": "storage_clients._base._key_value_store_client", + "name": "KeyValueStoreClient", + "parsedDocstring": { + "text": "An abstract class for key-value store resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing dataset by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1311, + "module": "storage_clients._base._dataset_collection_client", + "name": "get_or_create", + "parsedDocstring": { + "text": "Retrieve an existing dataset by its name or ID, or create a new one if it does not exist.\n", + "args": { + "id": "Optional ID of the dataset to retrieve or create. If provided, the method will attempt\nto find a dataset with the ID.", + "name": "Optional name of the dataset resource to retrieve or create. If provided, the method will\nattempt to find a dataset with this name.", + "schema": "Optional schema for the dataset resource to be created.\n" + }, + "returns": "Metadata object containing the information of the retrieved or created dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Metadata object containing the information of the retrieved or created dataset." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve an existing dataset by its name or ID, or create a new one if it does not exist.\n" + } + ] + }, + "flags": {}, + "id": 1312, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_or_create", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional ID of the dataset to retrieve or create. If provided, the method will attempt\nto find a dataset with the ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1313, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional name of the dataset resource to retrieve or create. If provided, the method will\nattempt to find a dataset with this name." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1314, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional schema for the dataset resource to be created.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1315, + "kind": 32768, + "kindString": "Parameter", + "name": "schema", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "757" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List the available datasets.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1316, + "module": "storage_clients._base._dataset_collection_client", + "name": "list", + "parsedDocstring": { + "text": "List the available datasets.\n", + "args": { + "unnamed": "Whether to list only the unnamed datasets.", + "limit": "Maximum number of datasets to return.", + "offset": "Number of datasets to skip from the beginning of the list.", + "desc": "Whether to sort the datasets in descending order.\n" + }, + "returns": "The list of available datasets matching the specified filters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The list of available datasets matching the specified filters." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "List the available datasets.\n" + } + ] + }, + "flags": {}, + "id": 1317, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to list only the unnamed datasets." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1318, + "kind": 32768, + "kindString": "Parameter", + "name": "unnamed", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of datasets to return." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1319, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of datasets to skip from the beginning of the list." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1320, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to sort the datasets in descending order.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1321, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetListPage", + "type": "reference", + "target": "817" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for dataset collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1311, + 1316 + ], + "title": "Methods" + } + ], + "id": 1310, + "module": "storage_clients._base._dataset_collection_client", + "name": "DatasetCollectionClient", + "parsedDocstring": { + "text": "An abstract class for dataset collection clients.\n\nThis collection client handles operations that involve multiple instances of a given resource type." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get metadata about the dataset being managed by this client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1323, + "module": "storage_clients._base._dataset_client", + "name": "get", + "parsedDocstring": { + "text": "Get metadata about the dataset being managed by this client.\n", + "returns": "An object containing the dataset's details, or None if the dataset does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object containing the dataset's details, or None if the dataset does not exist." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get metadata about the dataset being managed by this client.\n" + } + ] + }, + "flags": {}, + "id": 1324, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get", + "parameters": [], + "type": { + "name": "DatasetMetadata | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "757" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Update the dataset metadata.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1325, + "module": "storage_clients._base._dataset_client", + "name": "update", + "parsedDocstring": { + "text": "Update the dataset metadata.\n", + "args": { + "name": "New new name for the dataset.\n" + }, + "returns": "An object reflecting the updated dataset metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object reflecting the updated dataset metadata." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Update the dataset metadata.\n" + } + ] + }, + "flags": {}, + "id": 1326, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "update", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "New new name for the dataset.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1327, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "757" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the dataset managed by this client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1328, + "module": "storage_clients._base._dataset_client", + "name": "delete", + "parsedDocstring": { + "text": "Permanently delete the dataset managed by this client." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Permanently delete the dataset managed by this client." 
+ } + ] + }, + "flags": {}, + "id": 1329, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1330, + "module": "storage_clients._base._dataset_client", + "name": "list_items", + "parsedDocstring": { + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n", + "args": { + "offset": "The number of initial items to skip.", + "limit": "The maximum number of items to return.", + "clean": "If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.", + "desc": "If True, items are returned in descending order, i.e., newest first.", + "fields": "Specifies a subset of fields to include in each item.", + "omit": "Specifies a subset of fields to exclude from each item.", + "unwind": "Specifies a field that should be unwound. 
If it's an array, each element becomes a separate record.", + "skip_empty": "If True, omits items that are empty after other filters have been applied.", + "skip_hidden": "If True, omits fields starting with the '#' character.", + "flatten": "A list of fields to flatten in each item.", + "view": "The specific view of the dataset to use when retrieving items.\n" + }, + "returns": "An object with filtered, sorted, and paginated dataset items plus pagination details." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object with filtered, sorted, and paginated dataset items plus pagination details." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n" + } + ] + }, + "flags": {}, + "id": 1331, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of initial items to skip." 
+ } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1332, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to return." + } + ] + }, + "defaultValue": "_LIST_ITEMS_LIMIT", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1333, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1334, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, items are returned in descending order, i.e., newest first." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1335, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a subset of fields to include in each item." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1336, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a subset of fields to exclude from each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1337, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a field that should be unwound. If it's an array, each element becomes a separate record." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1338, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, omits items that are empty after other filters have been applied." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1339, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, omits fields starting with the '#' character." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1340, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of fields to flatten in each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1341, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The specific view of the dataset to use when retrieving items.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1342, + "kind": 32768, + "kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis 
method allows for asynchronously iterating through dataset items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1343, + "module": "storage_clients._base._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis method allows for asynchronously iterating through dataset items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n", + "args": { + "offset": "The number of initial items to skip.", + "limit": "The maximum number of items to iterate over. 
None means no limit.", + "clean": "If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.", + "desc": "If set to True, items are returned in descending order, i.e., newest first.", + "fields": "Specifies a subset of fields to include in each item.", + "omit": "Specifies a subset of fields to exclude from each item.", + "unwind": "Specifies a field that should be unwound into separate items.", + "skip_empty": "If set to True, omits items that are empty after other filters have been applied.", + "skip_hidden": "If set to True, omits fields starting with the '#' character from the output.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis method allows for asynchronously iterating through dataset items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "flags": {}, + "id": 1344, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of initial items to skip." 
+ } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1345, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to iterate over. None means no limit." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1346, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1347, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to True, items are returned in descending order, i.e., newest first." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1348, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a subset of fields to include in each item." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1349, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a subset of fields to exclude from each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1350, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a field that should be unwound into separate items." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1351, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to True, omits items that are empty after other filters have been applied." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1352, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to True, omits fields starting with the '#' character from the output.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1353, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items as bytes.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1354, + "module": "storage_clients._base._dataset_client", + "name": "get_items_as_bytes", + "parsedDocstring": { + "text": "Retrieve dataset items as bytes.\n", + "args": { + "item_format": "Output format (e.g., 'json', 'csv'); default is 'json'.", + "offset": "Number of items to skip; default is 0.", + "limit": "Max number of items to return; no default limit.", + "desc": "If True, results are returned in descending order.", + "clean": "If True, filters out empty items and hidden fields.", + "bom": "Include or exclude UTF-8 BOM; default behavior varies by format.", + "delimiter": "Delimiter character for CSV; default is ','.", + "fields": "List of fields to include in the results.", + "omit": "List of fields to omit from the results.", + "unwind": "Unwinds a field into separate records.", + "skip_empty": "If True, skips empty items in the output.", + "skip_header_row": "If True, skips the header row in CSV.", 
+ "skip_hidden": "If True, skips hidden fields in the output.", + "xml_root": "Root element name for XML output; default is 'items'.", + "xml_row": "Element name for each item in XML output; default is 'item'.", + "flatten": "List of fields to flatten.\n" + }, + "returns": "The dataset items as raw bytes." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The dataset items as raw bytes." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items as bytes.\n" + } + ] + }, + "flags": {}, + "id": 1355, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_items_as_bytes", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Output format (e.g., 'json', 'csv'); default is 'json'." + } + ] + }, + "defaultValue": "'json'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1356, + "kind": 32768, + "kindString": "Parameter", + "name": "item_format", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of items to skip; default is 0." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1357, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Max number of items to return; no default limit." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1358, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, results are returned in descending order." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1359, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, filters out empty items and hidden fields." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1360, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Include or exclude UTF-8 BOM; default behavior varies by format." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1361, + "kind": 32768, + "kindString": "Parameter", + "name": "bom", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delimiter character for CSV; default is ','." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1362, + "kind": 32768, + "kindString": "Parameter", + "name": "delimiter", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of fields to include in the results." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1363, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of fields to omit from the results." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1364, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds a field into separate records." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1365, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, skips empty items in the output." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1366, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, skips the header row in CSV." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1367, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_header_row", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, skips hidden fields in the output." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1368, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Root element name for XML output; default is 'items'." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1369, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_root", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Element name for each item in XML output; default is 'item'." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1370, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_row", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of fields to flatten.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1371, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items as a streaming response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1372, + "module": "storage_clients._base._dataset_client", + "name": "stream_items", + "parsedDocstring": { + "text": "Retrieve dataset items as a streaming response.\n", + "args": { + "item_format": "Output format, 
options include json, jsonl, csv, html, xlsx, xml, rss; default is json.", + "offset": "Number of items to skip at the start; default is 0.", + "limit": "Maximum number of items to return; no default limit.", + "desc": "If True, reverses the order of results.", + "clean": "If True, filters out empty items and hidden fields.", + "bom": "Include or exclude UTF-8 BOM; varies by format.", + "delimiter": "Delimiter for CSV files; default is ','.", + "fields": "List of fields to include in the output.", + "omit": "List of fields to omit from the output.", + "unwind": "Unwinds a field into separate records.", + "skip_empty": "If True, empty items are omitted.", + "skip_header_row": "If True, skips the header row in CSV.", + "skip_hidden": "If True, hides fields starting with the # character.", + "xml_root": "Custom root element name for XML output; default is 'items'.", + "xml_row": "Custom element name for each item in XML; default is 'item'.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 183 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve dataset items as a streaming response.\n" + } + ] + }, + "flags": {}, + "id": 1373, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stream_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Output format, options include json, jsonl, csv, html, xlsx, xml, rss; default is json." 
+ } + ] + }, + "defaultValue": "'json'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1374, + "kind": 32768, + "kindString": "Parameter", + "name": "item_format", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of items to skip at the start; default is 0." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1375, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of items to return; no default limit." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1376, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, reverses the order of results." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1377, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, filters out empty items and hidden fields." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1378, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Include or exclude UTF-8 BOM; varies by format." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1379, + "kind": 32768, + "kindString": "Parameter", + "name": "bom", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delimiter for CSV files; default is ','." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1380, + "kind": 32768, + "kindString": "Parameter", + "name": "delimiter", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of fields to include in the output." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1381, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of fields to omit from the output." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1382, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds a field into separate records." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1383, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, empty items are omitted." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1384, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, skips the header row in CSV." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1385, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_header_row", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, hides fields starting with the # character." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1386, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom root element name for XML output; default is 'items'." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1387, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_root", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom element name for each item in XML; default is 'item'.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1388, + "kind": 32768, + "kindString": "Parameter", + "name": "xml_row", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Response" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push items to the dataset.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1389, + "module": "storage_clients._base._dataset_client", + "name": "push_items", + "parsedDocstring": { + "text": "Push items to the dataset.\n", + "args": { + "items": "The items which to push in the dataset. 
They must be JSON serializable." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 226 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push items to the dataset.\n" + } + ] + }, + "flags": {}, + "id": 1390, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The items which to push in the dataset. They must be JSON serializable." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1391, + "kind": 32768, + "kindString": "Parameter", + "name": "items", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for dataset resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1328, + 1323, + 1354, + 1343, + 1330, + 1389, + 1372, + 1325 + ], + "title": "Methods" + } + ], + "id": 1322, + "module": "storage_clients._base._dataset_client", + "name": "DatasetClient", + "parsedDocstring": { + "text": "An abstract class for dataset resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1392, + "module": "statistics._statistics", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1393, + "module": "statistics._statistics", + "name": "TNewStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1394, + "module": "statistics._statistics", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1396, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1397, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as started." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1398, + "module": "statistics._statistics", + "name": "run", + "parsedDocstring": { + "text": "Mark the job as started." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as started." + } + ] + }, + "flags": {}, + "id": 1399, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "run", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as finished." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1400, + "module": "statistics._statistics", + "name": "finish", + "parsedDocstring": { + "text": "Mark the job as finished." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as finished." + } + ] + }, + "flags": {}, + "id": 1401, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "finish", + "parameters": [], + "type": { + "name": "timedelta", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the job has been retried." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1402, + "module": "statistics._statistics", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the job has been retried." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Tracks information about the processing of a request." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1396, + 1400, + 1398 + ], + "title": "Methods" + }, + { + "children": [ + 1402 + ], + "title": "Properties" + } + ], + "id": 1395, + "module": "statistics._statistics", + "name": "RequestProcessingRecord", + "parsedDocstring": { + "text": "Tracks information about the processing of a request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1404, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1405, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1406, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1407, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1408, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": 
"'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1409, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1410, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1411, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1412, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ] + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1413, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1414, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__init__", + "target": 1404, + "type": "reference" + } + } + ] + }, + { + 
"kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1415, + "module": "statistics._statistics", + "name": "replace_state_model", + "parsedDocstring": { + "text": "Create near copy of the `Statistics` with replaced `state_model`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." + } + ] + }, + "flags": {}, + "id": 1416, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "replace_state_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1417, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TNewStatisticsState", + "target": "1393" + } + ] + } + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TNewStatisticsState", + "target": "1393" + } + ], + "target": "1403" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." 
+ } + ] + }, + "decorations": [ + { + "name": "staticmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1418, + "module": "statistics._statistics", + "name": "with_default_state", + "parsedDocstring": { + "text": "Initialize a new instance with default state model `StatisticsState`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." + } + ] + }, + "flags": {}, + "id": 1419, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1420, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1421, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1422, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1423, + "kind": 32768, + "kindString": "Parameter", + "name": 
"log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1424, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1425, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1426, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1427, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "1470" + } + ], + "target": "1403" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1428, + "module": "statistics._statistics", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1429, + "module": "statistics._statistics", + "name": "__aenter__", + "parsedDocstring": { + "text": "Subscribe to events and start collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 1430, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + }, + "overwrites": { + "name": "Statistics.__aenter__", + "target": 1429, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1431, + "module": "statistics._statistics", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 1432, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1433, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1434, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1435, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__aexit__", + "target": 1431, + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1436, + "module": "statistics._statistics", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 194 + } + ], + "type": { + "name": "TStatisticsState", + "type": "reference", + "target": "1392" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1437, + "module": "statistics._statistics", + "name": "register_status_code", + "parsedDocstring": { + "text": "Increment the number of times a status code has been received." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "flags": {}, + "id": 1438, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "register_status_code", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1439, + "kind": 32768, + "kindString": "Parameter", + "name": "code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1440, + "module": "statistics._statistics", + "name": "record_request_processing_start", + "parsedDocstring": { + "text": "Mark a request as started." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 205 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "flags": {}, + "id": 1441, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_start", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1442, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1443, + "module": "statistics._statistics", + "name": "record_request_processing_finish", + "parsedDocstring": { + "text": "Mark a request as finished." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 212 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." 
+ } + ] + }, + "flags": {}, + "id": 1444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_finish", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1445, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1446, + "module": "statistics._statistics", + "name": "record_request_processing_failure", + "parsedDocstring": { + "text": "Mark a request as failed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 234 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." + } + ] + }, + "flags": {}, + "id": 1447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_failure", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1448, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1449, + "module": "statistics._statistics", + "name": "calculate", + "parsedDocstring": { + "text": "Calculate the current statistics." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 248 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." + } + ] + }, + "flags": {}, + "id": 1450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate", + "parameters": [], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1451, + "module": "statistics._statistics", + "name": "reset", + "parsedDocstring": { + "text": "Reset the statistics to their defaults and remove any persistent state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 271 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." 
+ } + ] + }, + "flags": {}, + "id": 1452, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for collecting, tracking, and logging runtime statistics for requests.\n\nIt is designed to record information such as request durations, retries, successes, and failures, enabling\nanalysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they\nremain available across crawler migrations, abortions, and restarts. This persistence allows for tracking\nand evaluation of crawler behavior over its lifecycle." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1429, + 1431, + 1404, + 1449, + 1446, + 1443, + 1440, + 1437, + 1415, + 1451, + 1418 + ], + "title": "Methods" + }, + { + "children": [ + 1428, + 1436 + ], + "title": "Properties" + } + ], + "id": 1403, + "module": "statistics._statistics", + "name": "Statistics", + "parsedDocstring": { + "text": "A class for collecting, tracking, and logging runtime statistics for requests.\n\nIt is designed to record information such as request durations, retries, successes, and failures, enabling\nanalysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they\nremain available across crawler migrations, abortions, and restarts. This persistence allows for tracking\nand evaluation of crawler behavior over its lifecycle." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "_NonPersistentStatistics", + "target": "2573", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1454, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1455, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1456, + "module": "statistics._models", + "name": "retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + 
"type": "reference", + "name": "int" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1457, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1458, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1459, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1460, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1461, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1462, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1463, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + 
"name": "timedelta", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Print out the Final Statistics data as a table." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1464, + "module": "statistics._models", + "name": "to_table", + "parsedDocstring": { + "text": "Print out the Final Statistics data as a table." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Print out the Final Statistics data as a table." + } + ] + }, + "flags": {}, + "id": 1465, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_table", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1466, + "module": "statistics._models", + "name": "to_dict", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1467, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_dict", + "parameters": [], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": 
"reference", + "name": "int" + } + ] + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ], + "target": "866" + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1468, + "module": "statistics._models", + "name": "__str__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about a crawler run." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1468, + 1466, + 1464 + ], + "title": "Methods" + }, + { + "children": [ + 1463, + 1457, + 1458, + 1461, + 1455, + 1460, + 1454, + 1459, + 1462, + 1456 + ], + "title": "Properties" + } + ], + "id": 1453, + "module": "statistics._models", + "name": "FinalStatistics", + "parsedDocstring": { + "text": "Statistics about a crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1471, + "module": "statistics._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1472, + "module": "statistics._models", + "name": "stats_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='statsId')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1473, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "int", + "type": "reference" 
+ } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1474, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1475, + "module": "statistics._models", + "name": "requests_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1476, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1477, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1478, + "module": "statistics._models", + "name": "request_min_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "2973" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1479, + "module": "statistics._models", + "name": "request_max_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "2973" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1480, + "module": "statistics._models", + "name": "request_total_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1481, + "module": "statistics._models", + "name": "request_total_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1482, + "module": "statistics._models", + "name": "crawler_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerStartedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1483, + "module": "statistics._models", + "name": "crawler_last_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')]", + "type": "union", + 
"types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1484, + "module": "statistics._models", + "name": "crawler_finished_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerFinishedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1485, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1486, + "module": "statistics._models", + "name": "errors", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, 
+ { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1487, + "module": "statistics._models", + "name": "retry_errors", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1488, + "module": "statistics._models", + "name": "requests_with_status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1489, + "module": "statistics._models", + "name": "stats_persisted_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": 
"literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1490, + "module": "statistics._models", + "name": "request_retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestTotalDurationMillis', return_type=timedelta_ms)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1491, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFailedDurationMillis', return_type=Optional[timedelta_ms])", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1492, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFinishedDurationMillis', return_type=Optional[timedelta_ms])", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1493, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestsTotal')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1494, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistic data about a crawler run." 
+ } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1484, + 1483, + 1485, + 1482, + 1486, + 1471, + 1492, + 1493, + 1479, + 1478, + 1490, + 1491, + 1480, + 1481, + 1474, + 1476, + 1473, + 1477, + 1475, + 1494, + 1488, + 1487, + 1472, + 1489 + ], + "title": "Properties" + } + ], + "id": 1470, + "module": "statistics._models", + "name": "StatisticsState", + "parsedDocstring": { + "text": "Statistic data about a crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "AdaptivePlaywrightCrawlerStatisticState", + "target": "2565", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1495, + "module": "statistics._error_tracker", + "name": "GroupName", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1496, + "module": "statistics._error_tracker", + "name": "ErrorFilenameGroups", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1497, + "module": "statistics._error_tracker", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1499, + "module": "statistics._error_tracker", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1500, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1501, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshot_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1502, + "kind": 32768, + "kindString": "Parameter", + "name": "show_error_name", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1503, + "kind": 
32768, + "kindString": "Parameter", + "name": "show_file_and_line_number", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1504, + "kind": 32768, + "kindString": "Parameter", + "name": "show_error_message", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1505, + "kind": 32768, + "kindString": "Parameter", + "name": "show_full_message", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1506, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an error in the statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1507, + "module": "statistics._error_tracker", + "name": "add", + "parsedDocstring": { + "text": "Add an error in the statistics.\n", + "args": { + "error": "Error to be added to statistics.", + "context": "Context used to collect error snapshot.", + "early": "Flag indicating that the error is added earlier than usual to have access to resources that will be\nclosed before normal error collection. This prevents double reporting during normal error collection." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an error in the statistics.\n" + } + ] + }, + "flags": {}, + "id": 1508, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Error to be added to statistics." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1509, + "kind": 32768, + "kindString": "Parameter", + "name": "error", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context used to collect error snapshot." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1510, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "309" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag indicating that the error is added earlier than usual to have access to resources that will be\nclosed before normal error collection. This prevents double reporting during normal error collection." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1511, + "kind": 32768, + "kindString": "Parameter", + "name": "early", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of distinct kinds of errors." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1512, + "module": "statistics._error_tracker", + "name": "unique_error_count", + "parsedDocstring": { + "text": "Number of distinct kinds of errors." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total number of errors." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1513, + "module": "statistics._error_tracker", + "name": "total", + "parsedDocstring": { + "text": "Total number of errors." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return n most common errors." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1514, + "module": "statistics._error_tracker", + "name": "get_most_common_errors", + "parsedDocstring": { + "text": "Return n most common errors." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return n most common errors." + } + ] + }, + "flags": {}, + "id": 1515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_most_common_errors", + "parameters": [ + { + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1516, + "kind": 32768, + "kindString": "Parameter", + "name": "n", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "int" + } + ] + } + ], + "target": "866" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track errors and aggregates their counts by similarity." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1499, + 1507, + 1514 + ], + "title": "Methods" + }, + { + "children": [ + 1513, + 1512 + ], + "title": "Properties" + } + ], + "id": 1498, + "module": "statistics._error_tracker", + "name": "ErrorTracker", + "parsedDocstring": { + "text": "Track errors and aggregates their counts by similarity." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1518, + "module": "statistics._error_snapshotter", + "name": "MAX_ERROR_CHARACTERS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1519, + "module": "statistics._error_snapshotter", + "name": "MAX_HASH_LENGTH", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1520, + "module": "statistics._error_snapshotter", + "name": "MAX_FILENAME_LENGTH", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1521, + "module": "statistics._error_snapshotter", + "name": "BASE_MESSAGE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1522, + "module": "statistics._error_snapshotter", + "name": "SNAPSHOT_PREFIX", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1523, + "module": "statistics._error_snapshotter", + "name": "ALLOWED_CHARACTERS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1524, + "module": "statistics._error_snapshotter", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1525, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1526, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshot_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1527, + "module": "statistics._error_snapshotter", + "name": "capture_snapshot", + "parsedDocstring": { + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. 
ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n", + "args": { + "error_message": "Used in filename of the snapshot.", + "file_and_line": "Used in filename of the snapshot.", + "context": "Context that is used to get the snapshot." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n" + } + ] + }, + "flags": {}, + "id": 1528, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "capture_snapshot", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used in filename of the snapshot." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1529, + "kind": 32768, + "kindString": "Parameter", + "name": "error_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used in filename of the snapshot." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1530, + "kind": 32768, + "kindString": "Parameter", + "name": "file_and_line", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context that is used to get the snapshot." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1531, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1524, + 1527 + ], + "title": "Methods" + }, + { + "children": [ + 1523, + 1521, + 1518, + 1520, + 1519, + 1522 + ], + "title": "Properties" + } + ], + "id": 1517, + "module": "statistics._error_snapshotter", + "name": "ErrorSnapshotter", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1532, + "module": "sessions._session_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1533, + "module": "sessions._session_pool", + "name": "CreateSessionFunctionType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } 
+ ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1535, + "module": "sessions._session_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "max_pool_size": "Maximum number of sessions to maintain in the pool. You can add more sessions to the pool\nby using the `add_session` method.", + "create_session_settings": "Settings for creating new session instances. If None, default settings will\nbe used. Do not set it if you are providing a `create_session_function`.", + "create_session_function": "A callable to create new session instances. If None, a default session settings\nwill be used. Do not set it if you are providing `create_session_settings`.", + "event_manager": "The event manager to handle events like persist state.", + "persistence_enabled": "Flag to enable or disable state persistence of the pool.", + "persist_state_kvs_name": "The name of the `KeyValueStore` used for state persistence.", + "persist_state_key": "The key under which the session pool's state is stored in the `KeyValueStore`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1536, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of sessions to maintain in the pool. You can add more sessions to the pool\nby using the `add_session` method." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1537, + "kind": 32768, + "kindString": "Parameter", + "name": "max_pool_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings for creating new session instances. If None, default settings will\nbe used. Do not set it if you are providing a `create_session_function`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1538, + "kind": 32768, + "kindString": "Parameter", + "name": "create_session_settings", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable to create new session instances. If None, a default session settings\nwill be used. Do not set it if you are providing `create_session_settings`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1539, + "kind": 32768, + "kindString": "Parameter", + "name": "create_session_function", + "type": { + "name": "CreateSessionFunctionType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "CreateSessionFunctionType", + "target": "1533" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager to handle events like persist state." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1540, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to enable or disable state persistence of the pool." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1541, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` used for state persistence." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1542, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which the session pool's state is stored in the `KeyValueStore`." 
+ } + ] + }, + "defaultValue": "'CRAWLEE_SESSION_POOL_STATE'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1543, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1544, + "module": "sessions._session_pool", + "name": "__repr__", + "parsedDocstring": { + "text": "Get a string representation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "flags": {}, + "id": 1545, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the total number of sessions currently maintained in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1546, + "module": "sessions._session_pool", + "name": "session_count", + "parsedDocstring": { + "text": "Get the total number of sessions currently maintained in the pool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are currently usable." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1547, + "module": "sessions._session_pool", + "name": "usable_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are currently usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are no longer usable." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1548, + "module": "sessions._session_pool", + "name": "retired_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are no longer usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1549, + "module": "sessions._session_pool", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the pool upon entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1550, + "module": "sessions._session_pool", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the pool upon entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the pool upon entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1551, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "SessionPool", + "type": "reference", + "target": "1534" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the pool upon exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1552, + "module": "sessions._session_pool", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the pool upon exiting the context manager.\n" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the pool upon exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1553, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1554, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1555, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1556, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." 
+ } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1557, + "module": "sessions._session_pool", + "name": "get_state", + "parsedDocstring": { + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 1558, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1559, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "SessionPoolModel | dict", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPoolModel", + "target": "1637" + }, + { + "type": "reference", + "name": "dict" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 1570, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1571, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": true + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 1572, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1573, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": false + } + ] + } + } + ], + "type": { + "name": "SessionPoolModel", + "type": "reference", + "target": "1637" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1560, + "module": "sessions._session_pool", + "name": "add_session", + "parsedDocstring": { + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n", + "args": { + "session": "The 
session to add to the pool." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 162 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n" + } + ] + }, + "flags": {}, + "id": 1561, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add_session", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session to add to the pool." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1562, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session", + "type": "reference", + "target": "1575" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1563, + "module": "sessions._session_pool", + "name": "get_session", + "parsedDocstring": { + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n", + "returns": "The session object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 179 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The session object." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n" + } + ] + }, + "flags": {}, + "id": 1564, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [], + "type": { + "name": "Session", + "type": "reference", + "target": "1575" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1565, + "module": "sessions._session_pool", + "name": "get_session_by_id", + "parsedDocstring": { + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n", + "args": { + "session_id": "The ID of the session to retrieve.\n" + }, + "returns": "The session object if found and usable, otherwise `None`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 199 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The session object if found and usable, otherwise `None`." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n" + } + ] + }, + "flags": {}, + "id": 1566, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session_by_id", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the session to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1567, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the KVS where the pool state is persisted." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1568, + "module": "sessions._session_pool", + "name": "reset_store", + "parsedDocstring": { + "text": "Reset the KVS where the pool state is persisted." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the KVS where the pool state is persisted." + } + ] + }, + "flags": {}, + "id": 1569, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset_store", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A pool of sessions that are managed, rotated, and persisted based on usage and age.\n\nIt ensures effective session management by maintaining a pool of sessions and rotating them based on\nusage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their\nlifecycle, and optionally persist the state to enable recovery." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1550, + 1552, + 1535, + 1544, + 1560, + 1563, + 1565, + 1557, + 1568 + ], + "title": "Methods" + }, + { + "children": [ + 1549, + 1548, + 1546, + 1547 + ], + "title": "Properties" + } + ], + "id": 1534, + "module": "sessions._session_pool", + "name": "SessionPool", + "parsedDocstring": { + "text": "A pool of sessions that are managed, rotated, and persisted based on usage and age.\n\nIt ensures effective session management by maintaining a pool of sessions and rotating them based on\nusage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their\nlifecycle, and optionally persist the state to enable recovery." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1574, + "module": "sessions._session", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1576, + "module": "sessions._session", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "id": "Unique identifier for the session, autogenerated if not provided.", + "max_age": "Time duration after which the session expires.", + "user_data": "Custom user data associated with the session.", + "max_error_score": "Threshold score beyond which the session is considered blocked.", + "error_score_decrement": "Value by which the error score is decremented on successful operations.", + "created_at": "Timestamp when the session was created, defaults to current UTC time if not provided.", + "usage_count": "Number of times the session has been used.", + "max_usage_count": "Maximum allowable uses of the session before it is considered expired.", + "error_score": "Current error score of the session.", + "cookies": "Cookies associated with the session.", + "blocked_status_codes": "HTTP status codes 
that indicate a session should be blocked." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1577, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the session, autogenerated if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1578, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time duration after which the session expires." + } + ] + }, + "defaultValue": "timedelta(minutes=50)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1579, + "kind": 32768, + "kindString": "Parameter", + "name": "max_age", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom user data associated with the session." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1580, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Threshold score beyond which the session is considered blocked." + } + ] + }, + "defaultValue": "3.0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1581, + "kind": 32768, + "kindString": "Parameter", + "name": "max_error_score", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value by which the error score is decremented on successful operations." + } + ] + }, + "defaultValue": "0.5", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1582, + "kind": 32768, + "kindString": "Parameter", + "name": "error_score_decrement", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the session was created, defaults to current UTC time if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1583, + "kind": 32768, + "kindString": "Parameter", + "name": "created_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the session has been used." 
+ } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1584, + "kind": 32768, + "kindString": "Parameter", + "name": "usage_count", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum allowable uses of the session before it is considered expired." + } + ] + }, + "defaultValue": "50", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1585, + "kind": 32768, + "kindString": "Parameter", + "name": "max_usage_count", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Current error score of the session." + } + ] + }, + "defaultValue": "0.0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1586, + "kind": 32768, + "kindString": "Parameter", + "name": "error_score", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookies associated with the session." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1587, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionCookies", + "target": "1663" + }, + { + "type": "reference", + "name": "CookieJar" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "1644" + } + ], + "target": "866" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that indicate a session should be blocked." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1588, + "kind": 32768, + "kindString": "Parameter", + "name": "blocked_status_codes", + "type": { + "name": "list | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from a `SessionModel`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1589, + "module": "sessions._session", + "name": "from_model", + "parsedDocstring": { + "text": "Initialize a new instance from a `SessionModel`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from a `SessionModel`." + } + ] + }, + "flags": {}, + "id": 1590, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1591, + "kind": 32768, + "kindString": "Parameter", + "name": "model", + "type": { + "name": "SessionModel", + "type": "reference", + "target": "1624" + } + } + ], + "type": { + "name": "Session", + "type": "reference", + "target": "1575" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1592, + "module": "sessions._session", + "name": "__repr__", + "parsedDocstring": { + "text": "Get a string representation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." 
+ } + ] + }, + "flags": {}, + "id": 1593, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare two sessions for equality." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1594, + "module": "sessions._session", + "name": "__eq__", + "parsedDocstring": { + "text": "Compare two sessions for equality." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare two sessions for equality." + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the session ID." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1597, + "module": "sessions._session", + "name": "id", + "parsedDocstring": { + "text": "Get the session ID." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the user data." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1598, + "module": "sessions._session", + "name": "user_data", + "parsedDocstring": { + "text": "Get the user data." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 98 + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the cookies." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1599, + "module": "sessions._session", + "name": "cookies", + "parsedDocstring": { + "text": "Get the cookies." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "SessionCookies", + "type": "reference", + "target": "1663" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current error score." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1600, + "module": "sessions._session", + "name": "error_score", + "parsedDocstring": { + "text": "Get the current error score." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current usage count." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1601, + "module": "sessions._session", + "name": "usage_count", + "parsedDocstring": { + "text": "Get the current usage count." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the expiration datetime of the session." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1602, + "module": "sessions._session", + "name": "expires_at", + "parsedDocstring": { + "text": "Get the expiration datetime of the session." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session is blocked based on the error score.." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1603, + "module": "sessions._session", + "name": "is_blocked", + "parsedDocstring": { + "text": "Indicate whether the session is blocked based on the error score.." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session is expired based on the current time." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1604, + "module": "sessions._session", + "name": "is_expired", + "parsedDocstring": { + "text": "Indicate whether the session is expired based on the current time." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session has reached its maximum usage limit." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1605, + "module": "sessions._session", + "name": "is_max_usage_count_reached", + "parsedDocstring": { + "text": "Indicate whether the session has reached its maximum usage limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine if the session is usable for next requests." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1606, + "module": "sessions._session", + "name": "is_usable", + "parsedDocstring": { + "text": "Determine if the session is usable for next requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1607, + "module": "sessions._session", + "name": "get_state", + "parsedDocstring": { + "text": "Retrieve the current state of the session either as a model or as a dictionary." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 1608, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1609, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "SessionModel | dict", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionModel", + "target": "1624" + }, + { + "type": "reference", + "name": "dict" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 1620, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": true + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 1622, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1623, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": false + } + ] + } + } + ], + "type": { + "name": "SessionModel", + "type": "reference", + "target": "1624" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as good. Should be called after a successful session usage." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1610, + "module": "sessions._session", + "name": "mark_good", + "parsedDocstring": { + "text": "Mark the session as good. Should be called after a successful session usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as good. Should be called after a successful session usage." + } + ] + }, + "flags": {}, + "id": 1611, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "mark_good", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as bad after an unsuccessful session usage." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1612, + "module": "sessions._session", + "name": "mark_bad", + "parsedDocstring": { + "text": "Mark the session as bad after an unsuccessful session usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as bad after an unsuccessful session usage." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "mark_bad", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1614, + "module": "sessions._session", + "name": "retire", + "parsedDocstring": { + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." + } + ] + }, + "flags": {}, + "id": 1615, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "retire", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1616, + "module": "sessions._session", + "name": "is_blocked_status_code", + "parsedDocstring": { + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n", + "args": { + "status_code": "The HTTP status code received from a server response.", + "ignore_http_error_status_codes": "Optional status codes to allow suppression of\ncodes from `blocked_status_codes`.\n" + }, + "returns": "True if the session should be retired, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the session should be retired, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n" + } + ] + }, + "flags": {}, + "id": 1617, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked_status_code", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP status code received from a server response." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1618, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional status codes to allow suppression of\ncodes from `blocked_status_codes`.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1619, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "set[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ], + "target": "1668" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent a single user session, managing cookies, error states, and usage limits.\n\nA `Session` simulates a specific user with attributes like 
cookies, IP (via proxy), and potentially\na unique browser fingerprint. It maintains its internal state, which can include custom user data\n(e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,\nusage count, and expiration." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1594, + 1576, + 1592, + 1589, + 1607, + 1616, + 1612, + 1610, + 1614 + ], + "title": "Methods" + }, + { + "children": [ + 1599, + 1600, + 1602, + 1597, + 1603, + 1604, + 1605, + 1606, + 1601, + 1598 + ], + "title": "Properties" + } + ], + "id": 1575, + "module": "sessions._session", + "name": "Session", + "parsedDocstring": { + "text": "Represent a single user session, managing cookies, error states, and usage limits.\n\nA `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially\na unique browser fingerprint. It maintains its internal state, which can include custom user data\n(e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,\nusage count, and expiration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1625, + "module": "sessions._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1626, + "module": "sessions._models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1627, + "module": "sessions._models", + "name": "max_age", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 1628, + "module": "sessions._models", + "name": "user_data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1629, + "module": "sessions._models", + "name": "max_error_score", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1630, + "module": "sessions._models", + "name": "error_score_decrement", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1631, + "module": "sessions._models", + "name": "created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1632, + "module": "sessions._models", + "name": "usage_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1633, + "module": "sessions._models", + "name": "max_usage_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1634, + "module": "sessions._models", + "name": "error_score", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1635, + "module": "sessions._models", + "name": "cookies", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": 
"reference", + "name": "CookieParam", + "target": "1644" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1636, + "module": "sessions._models", + "name": "blocked_status_codes", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ], + "target": "866" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a Session object." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1636, + 1635, + 1631, + 1634, + 1630, + 1626, + 1627, + 1629, + 1633, + 1625, + 1632, + 1628 + ], + "title": "Properties" + } + ], + "id": 1624, + "module": "sessions._models", + "name": "SessionModel", + "parsedDocstring": { + "text": "Model for a Session object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1638, + "module": "sessions._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1639, + "module": "sessions._models", + "name": "max_pool_size", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1640, + "module": "sessions._models", + "name": "sessions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Session", + "target": "1575" + } + ] + } + }, + { + 
"kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the total number of sessions currently maintained in the pool." + } + ] + }, + "decorations": [ + { + "args": "(alias='sessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1641, + "module": "sessions._models", + "name": "session_count", + "parsedDocstring": { + "text": "Get the total number of sessions currently maintained in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are currently usable." + } + ] + }, + "decorations": [ + { + "args": "(alias='usableSessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1642, + "module": "sessions._models", + "name": "usable_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are currently usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are no longer usable." 
+ } + ] + }, + "decorations": [ + { + "args": "(alias='retiredSessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1643, + "module": "sessions._models", + "name": "retired_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are no longer usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a SessionPool object." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1639, + 1638, + 1643, + 1641, + 1640, + 1642 + ], + "title": "Properties" + } + ], + "id": 1637, + "module": "sessions._models", + "name": "SessionPoolModel", + "parsedDocstring": { + "text": "Model for a SessionPool object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie name." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1645, + "module": "sessions._cookies", + "name": "name", + "parsedDocstring": { + "text": "Cookie name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1646, + "module": "sessions._cookies", + "name": "value", + "parsedDocstring": { + "text": "Cookie value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Domain for which the cookie is set." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1647, + "module": "sessions._cookies", + "name": "domain", + "parsedDocstring": { + "text": "Domain for which the cookie is set." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path on the specified domain for which the cookie is set." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1648, + "module": "sessions._cookies", + "name": "path", + "parsedDocstring": { + "text": "Path on the specified domain for which the cookie is set." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `Secure` flag for the cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1649, + "module": "sessions._cookies", + "name": "secure", + "parsedDocstring": { + "text": "Set the `Secure` flag for the cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `HttpOnly` flag for the cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1650, + "module": "sessions._cookies", + "name": "http_only", + "parsedDocstring": { + "text": "Set the `HttpOnly` flag for the cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Expiration date for the cookie, None for a session cookie." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1651, + "module": "sessions._cookies", + "name": "expires", + "parsedDocstring": { + "text": "Expiration date for the cookie, None for a session cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `SameSite` attribute for the cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1652, + "module": "sessions._cookies", + "name": "same_site", + "parsedDocstring": { + "text": "Set the `SameSite` attribute for the cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Dictionary representation of cookies for `SessionCookies.set` method." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1647, + 1651, + 1650, + 1645, + 1648, + 1652, + 1649, + 1646 + ], + "title": "Properties" + } + ], + "id": 1644, + "module": "sessions._cookies", + "name": "CookieParam", + "parsedDocstring": { + "text": "Dictionary representation of cookies for `SessionCookies.set` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1654, + "module": "sessions._cookies", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1655, + "module": "sessions._cookies", + "name": "value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1656, + "module": "sessions._cookies", + "name": "domain", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + 
"typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1657, + "module": "sessions._cookies", + "name": "path", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1658, + "module": "sessions._cookies", + "name": "secure", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1659, + "module": "sessions._cookies", + "name": "httpOnly", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 1660, + "module": "sessions._cookies", + "name": "expires", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "float" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1661, + "module": "sessions._cookies", + "name": "sameSite", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1662, + "module": "sessions._cookies", + "name": "partitionKey", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie parameters in 
Playwright format with camelCase naming." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1656, + 1660, + 1659, + 1654, + 1662, + 1657, + 1661, + 1658, + 1655 + ], + "title": "Properties" + } + ], + "id": 1653, + "module": "sessions._cookies", + "name": "PlaywrightCookieParam", + "parsedDocstring": { + "text": "Cookie parameters in Playwright format with camelCase naming." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1664, + "module": "sessions._cookies", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1665, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1666, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionCookies", + "target": "1663" + }, + { + "type": "reference", + "name": "CookieJar" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": 
[ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "1644" + } + ], + "target": "866" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The cookie jar instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1667, + "module": "sessions._cookies", + "name": "jar", + "parsedDocstring": { + "text": "The cookie jar instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "CookieJar", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store a cookie with modern browser attributes.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1668, + "module": "sessions._cookies", + "name": "set", + "parsedDocstring": { + "text": "Create and store a cookie with modern browser attributes.\n", + "args": { + "name": "Cookie name.", + "value": "Cookie value.", + "domain": "Cookie domain.", + "path": "Cookie path.", + "expires": "Cookie expiration timestamp.", + "http_only": "Whether cookie is HTTP-only.", + "secure": "Whether cookie requires secure context.", + "same_site": "SameSite cookie attribute value." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store a cookie with modern browser attributes.\n" + } + ] + }, + "flags": {}, + "id": 1669, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie name." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1670, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie value." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1671, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie domain." + } + ] + }, + "defaultValue": "''", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1672, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie path." + } + ] + }, + "defaultValue": "'/'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1673, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie expiration timestamp." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1674, + "kind": 32768, + "kindString": "Parameter", + "name": "expires", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether cookie is HTTP-only." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1675, + "kind": 32768, + "kindString": "Parameter", + "name": "http_only", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether cookie requires secure context." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1676, + "kind": 32768, + "kindString": "Parameter", + "name": "secure", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "SameSite cookie attribute value." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1677, + "kind": 32768, + "kindString": "Parameter", + "name": "same_site", + "type": { + "name": "Literal['Lax', 'None', 'Strict'] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1678, + "kind": 32768, + "kindString": "Parameter", + "name": "_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert cookies to a list with `CookieParam` dicts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1679, + "module": "sessions._cookies", + "name": "get_cookies_as_dicts", + "parsedDocstring": { + "text": "Convert cookies to a list with `CookieParam` dicts." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert cookies to a list with `CookieParam` dicts." 
+ } + ] + }, + "flags": {}, + "id": 1680, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_as_dicts", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "1644" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store a Cookie object in the session cookie jar.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1681, + "module": "sessions._cookies", + "name": "store_cookie", + "parsedDocstring": { + "text": "Store a Cookie object in the session cookie jar.\n", + "args": { + "cookie": "The Cookie object to store in the jar." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 190 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store a Cookie object in the session cookie jar.\n" + } + ] + }, + "flags": {}, + "id": 1682, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_cookie", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Cookie object to store in the jar." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1683, + "kind": 32768, + "kindString": "Parameter", + "name": "cookie", + "type": { + "name": "Cookie", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store multiple cookie objects in the session cookie jar.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1684, + "module": "sessions._cookies", + "name": "store_cookies", + "parsedDocstring": { + "text": "Store multiple cookie objects in the session cookie jar.\n", + "args": { + "cookies": "A list of cookie objects to store in the jar." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store multiple cookie objects in the session cookie jar.\n" + } + ] + }, + "flags": {}, + "id": 1685, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_cookies", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of cookie objects to store in the jar." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1686, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Cookie" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store cookies from their dictionary representations.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1687, + "module": "sessions._cookies", + "name": "set_cookies", + "parsedDocstring": { + "text": "Create and store cookies from their dictionary representations.\n", + "args": { + "cookie_dicts": "List of dictionaries where each dict represents cookie parameters." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 208 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store cookies from their dictionary representations.\n" + } + ] + }, + "flags": {}, + "id": 1688, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_cookies", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of dictionaries where each dict represents cookie parameters." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1689, + "kind": 32768, + "kindString": "Parameter", + "name": "cookie_dicts", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "1644" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get cookies in playwright format." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1690, + "module": "sessions._cookies", + "name": "get_cookies_as_playwright_format", + "parsedDocstring": { + "text": "Get cookies in playwright format." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 218 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get cookies in playwright format." + } + ] + }, + "flags": {}, + "id": 1691, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_as_playwright_format", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "PlaywrightCookieParam", + "target": "1653" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set cookies from playwright format." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1692, + "module": "sessions._cookies", + "name": "set_cookies_from_playwright_format", + "parsedDocstring": { + "text": "Set cookies from playwright format." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set cookies from playwright format." + } + ] + }, + "flags": {}, + "id": 1693, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_cookies_from_playwright_format", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1694, + "kind": 32768, + "kindString": "Parameter", + "name": "pw_cookies", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "PlaywrightCookieParam", + "target": "1653" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1695, + "module": "sessions._cookies", + "name": "__deepcopy__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1696, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__deepcopy__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1697, + "kind": 32768, + "kindString": "Parameter", + "name": "memo", + "type": { + "name": "dict[int, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": 
"int" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "SessionCookies", + "type": "reference", + "target": "1663" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1698, + "module": "sessions._cookies", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 234 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1699, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1700, + "module": "sessions._cookies", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 237 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1701, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1702, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1703, + "kind": 32768, + 
"kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1704, + "module": "sessions._cookies", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 240 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1705, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1706, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1707, + "module": "sessions._cookies", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 246 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1708, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + "type": { + 
"name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "1644" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1709, + "module": "sessions._cookies", + "name": "__repr__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1710, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1711, + "module": "sessions._cookies", + "name": "__bool__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 255 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1712, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1713, + "module": "sessions._cookies", + "name": "__eq__", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 260 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1714, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1715, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage cookies for session with browser-compatible serialization and deserialization." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1711, + 1695, + 1713, + 1704, + 1664, + 1707, + 1698, + 1709, + 1700, + 1679, + 1690, + 1668, + 1687, + 1692, + 1681, + 1684 + ], + "title": "Methods" + }, + { + "children": [ + 1667 + ], + "title": "Properties" + } + ], + "id": 1663, + "module": "sessions._cookies", + "name": "SessionCookies", + "parsedDocstring": { + "text": "Storage cookies for session with browser-compatible serialization and deserialization." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1716, + "module": "request_loaders._request_manager_tandem", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1718, + "module": "request_loaders._request_manager_tandem", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1719, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1720, + "kind": 32768, + "kindString": "Parameter", + "name": "request_loader", + "type": { + "name": "RequestLoader", + "type": "reference", + "target": "1770" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1721, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", 
+ "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1722, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 1772, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3531, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3531, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1724, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "flags": {}, + "id": 1774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3532, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3532, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1726, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "flags": {}, + "id": 1776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3533, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3533, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1728, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 1756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1757, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1758, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + }, + "overwrites": { + "name": "RequestManager.add_request", + "target": 1755, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_request", + "target": 1755, + "type": "reference" + } + 
}, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1732, + "module": "request_loaders._request_manager", + "name": "add_requests_batched", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + "args": { + "requests": "Requests to enqueue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1760, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests_batched", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1761, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." 
+ } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1762, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1763, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1764, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1765, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests_batched", + "target": 1759, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_requests_batched", + "target": 1759, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1739, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." 
+ } + ] + }, + "flags": {}, + "id": 1778, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3534, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3534, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1741, + "module": "request_loaders._request_manager", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." 
+ } + ] + }, + "flags": {}, + "id": 1767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1768, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 1766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 1766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1745, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "flags": {}, + "id": 1780, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1781, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 3535, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 3535, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1748, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of handled requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." 
+ } + ] + }, + "flags": {}, + "id": 1783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3536, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3536, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1750, + "module": "request_loaders._request_manager", + "name": "drop", + "parsedDocstring": { + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." 
+ } + ] + }, + "flags": {}, + "id": 1754, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.drop", + "target": 1753, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3585, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 1785, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1786, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "1717" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.\n\nIn this scenario, the contents of the \"loader\" get transferred into the \"manager\", allowing processing the requests\nfrom both sources and also enqueueing new requests (not possible with plain `RequestManager`)." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1718, + 1728, + 1732, + 1750, + 1739, + 1748, + 1722, + 1724, + 1726, + 1745, + 1741, + 3585 + ], + "title": "Methods" + } + ], + "id": 1717, + "module": "request_loaders._request_manager_tandem", + "name": "RequestManagerTandem", + "parsedDocstring": { + "text": "Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.\n\nIn this scenario, the contents of the \"loader\" get transferred into the \"manager\", allowing processing the requests\nfrom both sources and also enqueueing new requests (not possible with plain `RequestManager`)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestManager", + "target": "1752", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1753, + "module": "request_loaders._request_manager", + "name": "drop", + "parsedDocstring": { + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." 
+ } + ] + }, + "flags": {}, + "id": 1754, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.drop", + "target": 747, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1755, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 1756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1757, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1758, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest", + "type": "reference", + "target": "828" + }, + "overwrites": { + "name": "RequestManager.add_request", + "target": 1755, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1759, + "module": "request_loaders._request_manager", + "name": "add_requests_batched", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + "args": { + "requests": "Requests to enqueue.", + "batch_size": "The number of requests to add in one 
batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1760, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests_batched", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1761, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1762, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1763, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1764, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1765, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests_batched", + "target": 1759, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1766, + "module": "request_loaders._request_manager", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "flags": {}, + "id": 1767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1768, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 1766, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3531, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 1772, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3531, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.get_total_count", + "target": 1771, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3532, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "flags": {}, + "id": 1774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3532, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.is_empty", + "target": 1773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3533, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "flags": {}, + "id": 1776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3533, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.is_finished", + "target": 1775, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3534, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." 
+ } + ] + }, + "flags": {}, + "id": 1778, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3534, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.fetch_next_request", + "target": 1777, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3535, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 1780, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1781, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 3535, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.mark_request_as_handled", + "target": 1779, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3536, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of handled requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." 
+ } + ] + }, + "flags": {}, + "id": 1783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3536, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.get_handled_count", + "target": 1782, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3537, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 1785, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1786, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "1717" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1755, + 1759, + 1753, + 3534, + 3536, + 3531, + 3532, + 3533, + 3535, + 1766, + 3537 + ], + "title": "Methods" + } + ], + "id": 1752, + "module": "request_loaders._request_manager", + "name": "RequestManager", + "parsedDocstring": { + "text": "Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestLoader", + "target": "1770", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "RequestQueue", + "target": "507", + "type": "reference" + }, + { + "name": "RequestManagerTandem", + "target": "1717", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1771, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." 
+ } + ] + }, + "flags": {}, + "id": 1772, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 1771, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1773, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "flags": {}, + "id": 1774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 1773, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1775, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "flags": {}, + "id": 1776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 1775, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1777, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." 
+ } + ] + }, + "flags": {}, + "id": 1778, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 1777, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1779, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 1780, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1781, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 1779, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1782, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of handled requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." 
+ } + ] + }, + "flags": {}, + "id": 1783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 1782, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1784, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 1785, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1786, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "1717" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class defining the interface for classes that provide access to a read-only stream of requests.\n\nRequest loaders are used to manage and provide access to a storage of crawling requests.\n\nKey responsibilities:\n- Fetching the next request to be processed.\n- Marking requests as successfully handled after processing.\n- Managing state information such as the total and handled request counts." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1777, + 1782, + 1771, + 1773, + 1775, + 1779, + 1784 + ], + "title": "Methods" + } + ], + "id": 1770, + "module": "request_loaders._request_loader", + "name": "RequestLoader", + "parsedDocstring": { + "text": "An abstract class defining the interface for classes that provide access to a read-only stream of requests.\n\nRequest loaders are used to manage and provide access to a storage of crawling requests.\n\nKey responsibilities:\n- Fetching the next request to be processed.\n- Marking requests as successfully handled after processing.\n- Managing state information such as the total and handled request counts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RequestManager", + "target": "1752", + "type": "reference" + }, + { + "name": "RequestList", + "target": "1787", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1788, + "module": "request_loaders._request_list", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "requests": "The request objects (or their string representations) to be added to the provider.", + "name": "A name of the request list." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request objects (or their string representations) to be added to the provider." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1790, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Iterable[str | Request] | AsyncIterable[str | Request] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + }, + { + "type": "reference", + "name": "AsyncIterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A name of the request list." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1792, + "module": "request_loaders._request_list", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "str | None", + 
"type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1793, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return an offline approximation of the total number of requests in the source (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 1772, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 1771, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 1771, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1795, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the source (there might still be unfinished requests)." + } + ] + }, + "flags": {}, + "id": 1774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 1773, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 1773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1797, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "flags": {}, + "id": 1776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 1775, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 1775, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1799, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `null` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `null` if there are no more pending requests." 
+ } + ] + }, + "flags": {}, + "id": 1778, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "398" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 1777, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 1777, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1801, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 1780, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1781, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "828" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 1779, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 1779, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1804, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of handled requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of handled requests." 
+ } + ] + }, + "flags": {}, + "id": 1783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 1782, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 1782, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3538, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 1785, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1786, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "1717" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 1784, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a (potentially very large) list of URLs to crawl.\n\nDisclaimer: The `RequestList` class is in its early version and is not fully implemented. It is currently\nintended mainly for testing purposes and small-scale projects. The current implementation is only in-memory\nstorage and is very limited. It will be (re)implemented in the future. For more details, see the GitHub issue:\nhttps://github.com/apify/crawlee-python/issues/99. For production usage we recommend to use the `RequestQueue`." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1788, + 1799, + 1804, + 1793, + 1795, + 1797, + 1801, + 3538 + ], + "title": "Methods" + }, + { + "children": [ + 1792 + ], + "title": "Properties" + } + ], + "id": 1787, + "module": "request_loaders._request_list", + "name": "RequestList", + "parsedDocstring": { + "text": "Represents a (potentially very large) list of URLs to crawl.\n\nDisclaimer: The `RequestList` class is in its early version and is not fully implemented. It is currently\nintended mainly for testing purposes and small-scale projects. The current implementation is only in-memory\nstorage and is very limited. It will be (re)implemented in the future. 
For more details, see the GitHub issue:\nhttps://github.com/apify/crawlee-python/issues/99. For production usage we recommend to use the `RequestQueue`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestLoader", + "target": "1770", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1806, + "module": "http_clients._httpx", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1808, + "module": "http_clients._httpx", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1809, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1810, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "httpx.Response", + "type": "reference" + 
} + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1811, + "module": "http_clients._httpx", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1812, + "module": "http_clients._httpx", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1813, + "module": "http_clients._httpx", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + 
"flags": {}, + "groups": [], + "id": 1814, + "module": "http_clients._httpx", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1815, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1816, + "module": "http_clients._httpx", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1817, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1808, + 1814, + 1816 + ], + "title": "Methods" + }, + { + "children": [ + 1813, + 1811, + 1812 + ], + "title": "Properties" + } + ], + "id": 1807, + "module": "http_clients._httpx", + "name": "_HttpxResponse", + "parsedDocstring": { + "text": "Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1819, + "module": "http_clients._httpx", + "name": "handle_async_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1820, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "handle_async_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1821, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "httpx.Request", + "type": "reference" + } + } + ], + "type": { + "name": "httpx.Response", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP transport adapter that stores response cookies in a `Session`.\n\nThis transport adapter modifies the handling of HTTP requests to update the session cookies\nbased on the 
response cookies, ensuring that the cookies are stored in the session object\nrather than the `HTTPX` client itself." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1819 + ], + "title": "Async Resource Clients" + } + ], + "id": 1818, + "module": "http_clients._httpx", + "name": "_HttpxTransport", + "parsedDocstring": { + "text": "HTTP transport adapter that stores response cookies in a `Session`.\n\nThis transport adapter modifies the handling of HTTP requests to update the session cookies\nbased on the response cookies, ensuring that the cookies are stored in the session object\nrather than the `HTTPX` client itself." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1823, + "module": "http_clients._httpx", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session.", + "http1": "Whether to enable HTTP/1.1 support.", + "http2": "Whether to enable HTTP/2 support.", + "verify": "SSL certificates used to verify the identity of requested hosts.", + "header_generator": "Header generator instance to use for generating common headers.", + "async_client_kwargs": "Additional keyword arguments for `httpx.AsyncClient`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1824, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1825, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable HTTP/1.1 support." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1826, + "kind": 32768, + "kindString": "Parameter", + "name": "http1", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable HTTP/2 support." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1827, + "kind": 32768, + "kindString": "Parameter", + "name": "http2", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "SSL certificates used to verify the identity of requested hosts." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1828, + "kind": 32768, + "kindString": "Parameter", + "name": "verify", + "type": { + "name": "str | bool | SSLContext", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "reference", + "name": "SSLContext" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Header generator instance to use for generating common headers." + } + ] + }, + "defaultValue": "_DEFAULT_HEADER_GENERATOR", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1829, + "kind": 32768, + "kindString": "Parameter", + "name": "header_generator", + "type": { + "name": "HeaderGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGenerator", + "target": "1974" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional keyword arguments for `httpx.AsyncClient`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1830, + "kind": 32768, + "kindString": "Parameter", + "name": "async_client_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1831, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 142 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1928, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1929, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "1917" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1837, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the 
request.", + "proxy_info": "The information about the proxy to be used.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 179 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 1931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1932, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1933, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1935, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it 
entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1845, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 1939, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1941, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1944, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": 
"reference", + "name": "HttpResponse", + "target": "1909" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1854, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 339 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "flags": {}, + "id": 1948, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3522, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 1923, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3523, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + 
"flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3524, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1953, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the `HTTPX` library.\n\nThis client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import HttpxHttpClient\n\nhttp_client = HttpxHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3523, + 3524, + 1823, + 1854, + 1831, + 1837, + 1845 + ], + "title": "Methods" + }, + { + "children": [ + 3522 + ], + "title": "Properties" + } + ], + "id": 1822, + "module": "http_clients._httpx", + "name": "HttpxHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the `HTTPX` library.\n\nThis client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import HttpxHttpClient\n\nhttp_client = HttpxHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "1919", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1857, + "module": "http_clients._curl_impersonate", + "name": "get_cookies_for_curl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_for_curl", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1859, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "CurlRequest", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CurlMorsel" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1860, + "module": "http_clients._curl_impersonate", + "name": "update_cookies_from_curl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "update_cookies_from_curl", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1862, + "kind": 32768, + "kindString": "Parameter", + "name": "morsels", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CurlMorsel" + } + ], + "target": "866" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1857, + 1860 + ], + "title": "Methods" + } + ], + "id": 1856, + "module": "http_clients._curl_impersonate", + "name": "_EmptyCookies", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1864, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + 
"flags": {}, + "id": 1865, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1866, + "kind": 32768, + "kindString": "Parameter", + "name": "args", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1867, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1864 + ], + "title": "Methods" + } + ], + "id": 1863, + "module": "http_clients._curl_impersonate", + "name": "_AsyncSession", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1869, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1870, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": 
false, + "keyword-only": false + }, + "id": 1871, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Response", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1872, + "module": "http_clients._curl_impersonate", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1873, + "module": "http_clients._curl_impersonate", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1874, + "module": "http_clients._curl_impersonate", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": 
{ + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1875, + "module": "http_clients._curl_impersonate", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1876, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1877, + "module": "http_clients._curl_impersonate", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1878, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1869, + 1875, + 1877 + ], + "title": "Methods" + }, + { + "children": [ + 1874, + 1872, + 1873 + ], + "title": "Properties" + } + ], + "id": 1868, + "module": "http_clients._curl_impersonate", + "name": "_CurlImpersonateResponse", + "parsedDocstring": { + "text": "Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1880, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session.", + "async_session_kwargs": "Additional keyword arguments for `curl_cffi.requests.AsyncSession`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1881, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1882, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional keyword arguments for `curl_cffi.requests.AsyncSession`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1883, + "kind": 32768, + "kindString": "Parameter", + "name": "async_session_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1884, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.\n" + }, + "returns": "The result of the crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1928, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1929, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "1917" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1890, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the 
request.", + "proxy_info": "The information about the proxy to be used.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 1931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1932, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1933, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1935, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it 
entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1898, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 217 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 1939, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1941, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1944, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": 
"reference", + "name": "HttpResponse", + "target": "1909" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1907, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 306 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "flags": {}, + "id": 1948, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3525, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 1923, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3526, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + 
"flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3527, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1953, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the `curl-cffi` library.\n\nThis client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n\nhttp_client = CurlImpersonateHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3526, + 3527, + 1880, + 1907, + 1884, + 1890, + 1898 + ], + "title": "Methods" + }, + { + "children": [ + 3525 + ], + "title": "Properties" + } + ], + "id": 1879, + "module": "http_clients._curl_impersonate", + "name": "CurlImpersonateHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the `curl-cffi` library.\n\nThis client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n\nhttp_client = 
CurlImpersonateHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "1919", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP version used in the response." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1910, + "module": "http_clients._base", + "name": "http_version", + "parsedDocstring": { + "text": "The HTTP version used in the response." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP status code received from the server." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1911, + "module": "http_clients._base", + "name": "status_code", + "parsedDocstring": { + "text": "The HTTP status code received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers received in the response." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1912, + "module": "http_clients._base", + "name": "headers", + "parsedDocstring": { + "text": "The HTTP headers received in the response." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1913, + "module": "http_clients._base", + "name": "read", + "parsedDocstring": { + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. 
It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + } + ] + }, + "flags": {}, + "id": 1914, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1915, + "module": "http_clients._base", + "name": "read_stream", + "parsedDocstring": { + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. 
It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + } + ] + }, + "flags": {}, + "id": 1916, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Define the interface that any HTTP response object must implement." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1913, + 1915 + ], + "title": "Methods" + }, + { + "children": [ + 1912, + 1910, + 1911 + ], + "title": "Properties" + } + ], + "id": 1909, + "module": "http_clients._base", + "name": "HttpResponse", + "parsedDocstring": { + "text": "Define the interface that any HTTP response object must implement." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1918, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Result of an HTTP-only crawl.\n\nMainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,\n`ParselCrawlingContext`, ...)." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1918 + ], + "title": "Properties" + } + ], + "id": 1917, + "module": "http_clients._base", + "name": "HttpCrawlingResult", + "parsedDocstring": { + "text": "Result of an HTTP-only crawl.\n\nMainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,\n`ParselCrawlingContext`, ...)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpCrawlingContext", + "target": "2625", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1920, + "module": "http_clients._base", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1922, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1923, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1924, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1928, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1929, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "1917" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1930, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 1931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1932, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1933, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1935, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + 
"flags": {}, + "groups": [], + "id": 1938, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 1939, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1941, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1944, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": 
"reference", + "name": "HttpResponse", + "target": "1909" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1947, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "flags": {}, + "id": 1948, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1949, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1951, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1953, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses)." 
+ } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1949, + 1951, + 1920, + 1947, + 1924, + 1930, + 1938 + ], + "title": "Methods" + }, + { + "children": [ + 1923 + ], + "title": "Properties" + } + ], + "id": 1919, + "module": "http_clients._base", + "name": "HttpClient", + "parsedDocstring": { + "text": "An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpxHttpClient", + "target": "1822", + "type": "reference" + }, + { + "name": "CurlImpersonateHttpClient", + "target": "1879", + "type": "reference" + }, + { + "name": "PlaywrightHttpClient", + "target": "2187", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1956, + "module": "fingerprint_suite._types", + "name": "SupportedOperatingSystems", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1957, + "module": "fingerprint_suite._types", + "name": "SupportedDevices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1958, + "module": "fingerprint_suite._types", + "name": "SupportedHttpVersion", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1959, + "module": "fingerprint_suite._types", + "name": "SupportedBrowserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines the screen constrains for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1961, + "module": "fingerprint_suite._types", + "name": "model_config", + "parsedDocstring": { + "text": "Defines the screen constrains for the fingerprint generator." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Minimal screen width constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1962, + "module": "fingerprint_suite._types", + "name": "min_width", + "parsedDocstring": { + "text": "Minimal screen width constraint for the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='minWidth')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximal screen width constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1963, + "module": "fingerprint_suite._types", + "name": "max_width", + "parsedDocstring": { + "text": "Maximal screen width constraint for the fingerprint generator." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='maxWidth')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Minimal screen height constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1964, + "module": "fingerprint_suite._types", + "name": "min_height", + "parsedDocstring": { + "text": "Minimal screen height constraint for the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='minHeight')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximal screen height constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1965, + "module": "fingerprint_suite._types", + "name": "max_height", + "parsedDocstring": { + "text": "Maximal screen height constraint for the fingerprint generator." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='maxHeight')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1965, + 1963, + 1964, + 1962, + 1961 + ], + "title": "Properties" + } + ], + "id": 1960, + "module": "fingerprint_suite._types", + "name": "ScreenOptions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1967, + "module": "fingerprint_suite._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of BrowserSpecifications to generate the headers for." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1968, + "module": "fingerprint_suite._types", + "name": "browsers", + "parsedDocstring": { + "text": "List of BrowserSpecifications to generate the headers for." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "list[SupportedBrowserType] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedBrowserType", + "target": "1959" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of operating systems to generate the headers for." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1969, + "module": "fingerprint_suite._types", + "name": "operating_systems", + "parsedDocstring": { + "text": "List of operating systems to generate the headers for." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedOperatingSystems", + "target": "1956" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of devices to generate the headers for." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1970, + "module": "fingerprint_suite._types", + "name": "devices", + "parsedDocstring": { + "text": "List of devices to generate the headers for." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "list[SupportedDevices] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedDevices", + "target": "1957" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of at most 10 languages to include in the [Accept-Language]\n(https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header\nin the language format accepted by that header, for example `en`, `en-US` or `de`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1971, + "module": "fingerprint_suite._types", + "name": "locales", + "parsedDocstring": { + "text": "List of at most 10 languages to include in the [Accept-Language]\n(https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header\nin the language format accepted by that header, for example `en`, `en-US` or `de`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP version to be used for header generation (the headers differ depending on the version)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1972, + "module": "fingerprint_suite._types", + "name": "http_version", + "parsedDocstring": { + "text": "HTTP version to be used for header generation (the headers differ depending on the version)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SupportedHttpVersion", + "target": "1958" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If true, the generator will throw an error if it cannot generate headers based on the input." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1973, + "module": "fingerprint_suite._types", + "name": "strict", + "parsedDocstring": { + "text": "If true, the generator will throw an error if it cannot generate headers based on the input." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Collection of header related attributes that can be used by the fingerprint generator." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1968, + 1970, + 1972, + 1971, + 1967, + 1969, + 1973 + ], + "title": "Properties" + } + ], + "id": 1966, + "module": "fingerprint_suite._types", + "name": "HeaderGeneratorOptions", + "parsedDocstring": { + "text": "Collection of header related attributes that can be used by the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1975, + "module": "fingerprint_suite._header_generator", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1976, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": 
[], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1977, + "module": "fingerprint_suite._header_generator", + "name": "get_specific_headers", + "parsedDocstring": { + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." 
+ } + ] + }, + "flags": {}, + "id": 1978, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_specific_headers", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1979, + "kind": 32768, + "kindString": "Parameter", + "name": "header_names", + "type": { + "name": "set[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "1668" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1980, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "1959" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1981, + "module": "fingerprint_suite._header_generator", + "name": "get_common_headers", + "parsedDocstring": { + "text": "Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." + } + ] + }, + "flags": {}, + "id": 1982, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_common_headers", + "parameters": [], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a random User-Agent header." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1983, + "module": "fingerprint_suite._header_generator", + "name": "get_random_user_agent_header", + "parsedDocstring": { + "text": "Get a random User-Agent header." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a random User-Agent header." 
+ } + ] + }, + "flags": {}, + "id": 1984, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_random_user_agent_header", + "parameters": [], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the User-Agent header based on the browser type." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1985, + "module": "fingerprint_suite._header_generator", + "name": "get_user_agent_header", + "parsedDocstring": { + "text": "Get the User-Agent header based on the browser type." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the User-Agent header based on the browser type." + } + ] + }, + "flags": {}, + "id": 1986, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_user_agent_header", + "parameters": [ + { + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1987, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "1959" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the sec-ch-ua headers based on the browser type." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1988, + "module": "fingerprint_suite._header_generator", + "name": "get_sec_ch_ua_headers", + "parsedDocstring": { + "text": "Get the sec-ch-ua headers based on the browser type." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the sec-ch-ua headers based on the browser type." + } + ] + }, + "flags": {}, + "id": 1989, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_sec_ch_ua_headers", + "parameters": [ + { + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1990, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "1959" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate realistic looking or browser-like HTTP headers." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1975, + 1981, + 1983, + 1988, + 1977, + 1985 + ], + "title": "Methods" + } + ], + "id": 1974, + "module": "fingerprint_suite._header_generator", + "name": "HeaderGenerator", + "parsedDocstring": { + "text": "Generate realistic looking or browser-like HTTP headers." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1992, + "module": "fingerprint_suite._fingerprint_generator", + "name": "generate", + "parsedDocstring": { + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_fingerprint_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." 
+ } + ] + }, + "flags": {}, + "id": 1993, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [], + "type": { + "name": "Fingerprint", + "type": "reference" + }, + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1992, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for creating browser fingerprints that mimic browser fingerprints of real users." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1992 + ], + "title": "Methods" + } + ], + "id": 1991, + "module": "fingerprint_suite._fingerprint_generator", + "name": "FingerprintGenerator", + "parsedDocstring": { + "text": "A class for creating browser fingerprints that mimic browser fingerprints of real users." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_fingerprint_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BrowserforgeFingerprintGenerator", + "target": "2015", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1994, + "module": "fingerprint_suite._consts", + "name": "COMMON_ACCEPT_LANGUAGE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, 
+ "groups": [], + "id": 1995, + "module": "fingerprint_suite._consts", + "name": "BROWSER_TYPE_HEADER_KEYWORD", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. It tries\nto generate headers several times until they match the requirements.\n\nThe `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome\nbut also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`\ninput, such as:\n```\nMozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)\n CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1\n```\n\nTo maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1997, + "module": "fingerprint_suite._browserforge_adapter", + "name": "generate", + "parsedDocstring": { + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. 
It tries\nto generate headers several times until they match the requirements.\n\nThe `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome\nbut also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`\ninput, such as:\n```\nMozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)\n CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1\n```\n\nTo maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.\n", + "returns": "A generated headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A generated headers." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. It tries\nto generate headers several times until they match the requirements.\n\nThe `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome\nbut also other Chromium-based browsers. 
As a result, a Safari-like user agent may be generated for a `chromium`\ninput, such as:\n```\nMozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)\n CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1\n```\n\nTo maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.\n" + } + ] + }, + "flags": {}, + "id": 1998, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1999, + "kind": 32768, + "kindString": "Parameter", + "name": "browser", + "type": { + "name": "Iterable[str | Browser] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Browser" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2000, + "kind": 32768, + "kindString": "Parameter", + "name": "os", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2001, + "kind": 32768, + "kindString": "Parameter", + "name": "device", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2002, + "kind": 32768, + "kindString": "Parameter", + "name": "locale", + "type": { + "name": "ListOrString | None", + 
"type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2003, + "kind": 32768, + "kindString": "Parameter", + "name": "http_version", + "type": { + "name": "Literal[1, 2] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": 1 + }, + { + "type": "literal", + "value": 2 + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2004, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2005, + "kind": 32768, + "kindString": "Parameter", + "name": "strict", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2006, + "kind": 32768, + "kindString": "Parameter", + "name": "request_dependent_headers", + "type": { + "name": "dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + 
"name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1997 + ], + "title": "Methods" + } + ], + "id": 1996, + "module": "fingerprint_suite._browserforge_adapter", + "name": "PatchedHeaderGenerator", + "parsedDocstring": { + "text": "Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2008, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "screen": "Screen constraints for the generated fingerprint.", + "strict": "Whether to raise an exception if the constraints are too strict.", + "mock_webrtc": "Whether to mock WebRTC when injecting the fingerprint.", + "slim": "Disables performance-heavy evasions when injecting the fingerprint.", + "**header_kwargs": "Header generation options for `HeaderGenerator`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2009, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Screen constraints for the generated fingerprint." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2010, + "kind": 32768, + "kindString": "Parameter", + "name": "screen", + "type": { + "name": "Screen | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Screen" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to raise an exception if the constraints are too strict." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2011, + "kind": 32768, + "kindString": "Parameter", + "name": "strict", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to mock WebRTC when injecting the fingerprint." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2012, + "kind": 32768, + "kindString": "Parameter", + "name": "mock_webrtc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables performance-heavy evasions when injecting the fingerprint." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2013, + "kind": 32768, + "kindString": "Parameter", + "name": "slim", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2014, + "kind": 32768, + "kindString": "Parameter", + "name": "header_kwargs", + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2008 + ], + "title": "Methods" + } + ], + "id": 2007, + "module": "fingerprint_suite._browserforge_adapter", + "name": "PatchedFingerprintGenerator", + "parsedDocstring": { + "text": "Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. 
Use explicit values if it matters for your use case.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2016, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. Use explicit values if it matters for your use case.\n", + "args": { + "header_options": "Collection of header related attributes that can be used by the fingerprint generator.", + "screen_options": "Defines the screen constrains for the fingerprint generator.", + "mock_web_rtc": "Whether to mock WebRTC when injecting the fingerprint.", + "slim": "Disables performance-heavy evasions when injecting the fingerprint." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. Use explicit values if it matters for your use case.\n" + } + ] + }, + "flags": {}, + "id": 2017, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Collection of header related attributes that can be used by the fingerprint generator." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2018, + "kind": 32768, + "kindString": "Parameter", + "name": "header_options", + "type": { + "name": "HeaderGeneratorOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGeneratorOptions", + "target": "1966" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines the screen constrains for the fingerprint generator." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2019, + "kind": 32768, + "kindString": "Parameter", + "name": "screen_options", + "type": { + "name": "ScreenOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ScreenOptions", + "target": "1960" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to mock WebRTC when injecting the fingerprint." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2020, + "kind": 32768, + "kindString": "Parameter", + "name": "mock_web_rtc", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables performance-heavy evasions when injecting the fingerprint." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2021, + "kind": 32768, + "kindString": "Parameter", + "name": "slim", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2022, + "module": "fingerprint_suite._fingerprint_generator", + "name": "generate", + "parsedDocstring": { + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." 
+ } + ] + }, + "flags": {}, + "id": 1993, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [], + "type": { + "name": "Fingerprint", + "type": "reference" + }, + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1992, + "type": "reference" + } + } + ], + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1992, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.\n\n`browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2016, + 2022 + ], + "title": "Methods" + } + ], + "id": 2015, + "module": "fingerprint_suite._browserforge_adapter", + "name": "BrowserforgeFingerprintGenerator", + "parsedDocstring": { + "text": "`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.\n\n`browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 182 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "FingerprintGenerator", + "target": "1991", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2025, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 246 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2026, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate headers." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2027, + "module": "fingerprint_suite._browserforge_adapter", + "name": "generate", + "parsedDocstring": { + "text": "Generate headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate headers." + } + ] + }, + "flags": {}, + "id": 2028, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [ + { + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2029, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "1959" + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "`HeaderGenerator` adapter for fingerprint generator from `browserforge`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2025, + 2027 + ], + "title": "Methods" + } + ], + "id": 2024, + "module": "fingerprint_suite._browserforge_adapter", + "name": "BrowserforgeHeaderGenerator", + "parsedDocstring": { + "text": "`HeaderGenerator` adapter for fingerprint generator from `browserforge`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 243 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get header network that contains possible header values." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2030, + "module": "fingerprint_suite._browserforge_adapter", + "name": "get_available_header_network", + "parsedDocstring": { + "text": "Get header network that contains possible header values." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 254 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get header network that contains possible header values." + } + ] + }, + "flags": {}, + "id": 2031, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_available_header_network", + "parameters": [], + "type": { + "name": "dict", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get set of possible header values from available header network." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2032, + "module": "fingerprint_suite._browserforge_adapter", + "name": "get_available_header_values", + "parsedDocstring": { + "text": "Get set of possible header values from available header network." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 263 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get set of possible header values from available header network." + } + ] + }, + "flags": {}, + "id": 2033, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_available_header_values", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2034, + "kind": 32768, + "kindString": "Parameter", + "name": "header_network", + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2035, + "kind": 32768, + "kindString": "Parameter", + "name": "node_name", + "type": { + "name": "str | set[str]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "1668" + } + ] + } + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "1668" + } + } + ] + }, + { + "kind": 8, + "kindString": "Enumeration", + "children": [ + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2037, + "module": "events._types", + "name": "PERSIST_STATE", + "parsedDocstring": 
{ + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "type": "literal", + "value": "'persistState'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2038, + "module": "events._types", + "name": "SYSTEM_INFO", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "type": "literal", + "value": "'systemInfo'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2039, + "module": "events._types", + "name": "MIGRATING", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "type": "literal", + "value": "'migrating'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2040, + "module": "events._types", + "name": "ABORTING", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "type": "literal", + "value": "'aborting'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + 
"id": 2041, + "module": "events._types", + "name": "EXIT", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "type": "literal", + "value": "'exit'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2042, + "module": "events._types", + "name": "SESSION_RETIRED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "type": "literal", + "value": "'sessionRetired'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2043, + "module": "events._types", + "name": "BROWSER_LAUNCHED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "type": "literal", + "value": "'browserLaunched'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2044, + "module": "events._types", + "name": "BROWSER_RETIRED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "type": "literal", + "value": "'browserRetired'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2045, + "module": "events._types", + "name": "BROWSER_CLOSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "type": "literal", + "value": "'browserClosed'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2046, + "module": "events._types", + "name": "PAGE_CREATED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "type": "literal", + "value": "'pageCreated'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2047, + "module": "events._types", + "name": "PAGE_CLOSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "type": "literal", + "value": "'pageClosed'" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Names of all possible events that can be emitted using an `EventManager`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2040, + 2045, + 2043, + 2044, + 2041, + 2039, + 2047, + 2046, + 2037, + 2042, + 2038 + ], + "title": "Enumeration members" + } + ], + "id": 2036, + "module": "events._types", + "name": "Event", + "parsedDocstring": { + "text": "Names of all possible events that can be emitted using an `EventManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2049, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2050, + "module": "events._types", + "name": "is_migrating", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the persist state event." 
+ } + ] + }, + "decorations": [ + { + "args": "('Event payloads')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2050, + 2049 + ], + "title": "Properties" + } + ], + "id": 2048, + "module": "events._types", + "name": "EventPersistStateData", + "parsedDocstring": { + "text": "Data for the persist state event." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2052, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2053, + "module": "events._types", + "name": "cpu_info", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "CpuInfo", + "type": "reference", + "target": "2882" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2054, + "module": "events._types", + "name": "memory_info", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "MemoryUsageInfo", + "type": "reference", + "target": "2886" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the system info event." + } + ] + }, + "decorations": [ + { + "args": "('Event payloads')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2053, + 2054, + 2052 + ], + "title": "Properties" + } + ], + "id": 2051, + "module": "events._types", + "name": "EventSystemInfoData", + "parsedDocstring": { + "text": "Data for the system info event." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2056, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2057, + "module": "events._types", + "name": "time_remaining", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Annotated[timedelta_secs | 
None, Field(alias='timeRemainingSecs')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_secs", + "target": "2974" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the migrating event." + } + ] + }, + "decorations": [ + { + "args": "('Event payloads')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2056, + 2057 + ], + "title": "Properties" + } + ], + "id": 2055, + "module": "events._types", + "name": "EventMigratingData", + "parsedDocstring": { + "text": "Data for the migrating event." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2059, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the aborting event." + } + ] + }, + "decorations": [ + { + "args": "('Event payloads')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2059 + ], + "title": "Properties" + } + ], + "id": 2058, + "module": "events._types", + "name": "EventAbortingData", + "parsedDocstring": { + "text": "Data for the aborting event." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2061, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the exit event." + } + ] + }, + "decorations": [ + { + "args": "('Event payloads')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2061 + ], + "title": "Properties" + } + ], + "id": 2060, + "module": "events._types", + "name": "EventExitData", + "parsedDocstring": { + "text": "Data for the exit event." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A helper type for all possible event payloads" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2062, + "module": "events._types", + "name": "EventData", + "parsedDocstring": { + "text": "A helper type for all possible event payloads" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2063, + "module": "events._types", + "name": "WrappedListener", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2064, + "module": "events._types", + "name": "TEvent", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An event 
listener function - it can be both sync and async and may accept zero or one argument." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2065, + "module": "events._types", + "name": "EventListener", + "parsedDocstring": { + "text": "An event listener function - it can be both sync and async and may accept zero or one argument." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2066, + "module": "events._local_event_manager", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2068, + "module": "events._local_event_manager", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n", + "args": { + "system_info_interval": "Interval at which `SystemInfo` events are emitted.", + "event_manager_options": "Additional options for the parent class." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "flags": {}, + "id": 2069, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `SystemInfo` events are emitted." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2070, + "kind": 32768, + "kindString": "Parameter", + "name": "system_info_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2084, + "module": "events._event_manager", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval between emitted `PersistState` events to maintain state persistence." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2085, + "module": "events._event_manager", + "name": "close_timeout", + "parsedDocstring": { + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "EventManager.__init__", + "target": 2087, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__init__", + "target": 2087, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2072, + "module": "events._local_event_manager", + "name": "from_config", + "parsedDocstring": { + "text": "Initialize a new instance based on the provided 
`Configuration`.\n", + "args": { + "config": "The `Configuration` instance. Uses the global (default) one if not provided." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "flags": {}, + "id": 2073, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_config", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Uses the global (default) one if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2074, + "kind": 32768, + "kindString": "Parameter", + "name": "config", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "LocalEventManager", + "type": "reference", + "target": "2067" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2075, + "module": "events._local_event_manager", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "LocalEventManager", + "type": "reference", + "target": "2067" + }, + "overwrites": { + "name": "EventManager.__aenter__", + "target": 2092, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__aenter__", + "target": 2092, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2077, + "module": "events._local_event_manager", + "name": "__aexit__", + "parsedDocstring": { + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." 
+ } + ] + }, + "flags": {}, + "id": 2078, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "EventManager.__aexit__", + "target": 2094, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__aexit__", + "target": 2094, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3517, + "module": "events._event_manager", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.active", + "target": 2091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3518, + "module": "events._event_manager", + "name": "on", + "parsedDocstring": { + "text": "Register an event listener for a specific event.\n", + "args": { + "event": "The event for which to listen to.", + "listener": "The function (sync or async) which is to be called when the event is emitted." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2100, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2101, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2102, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2114, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2115, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.PERSIST_STATE]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2116, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventPersistStateData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2117, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2118, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.SYSTEM_INFO]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2119, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventSystemInfoData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2120, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2121, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.MIGRATING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2122, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventMigratingData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2123, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2124, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.ABORTING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2125, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventAbortingData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2126, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2127, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.EXIT]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2128, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventExitData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2129, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2130, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2131, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[None]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.on", + "target": 2099, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3519, + "module": "events._event_manager", + "name": "off", + "parsedDocstring": { + "text": "Remove a specific listener or all listeners for an event.\n", + "args": { + "event": "The Actor event for which to remove listeners.", + "listener": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "flags": {}, + "id": 2104, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "off", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Actor event for which to remove listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2105, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any] | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.off", + "target": 2103, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.off", + "target": 2103, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3520, + "module": "events._event_manager", + "name": "emit", + "parsedDocstring": { + "text": "Emit an event with the associated data to all registered listeners.\n", + "args": { + "event": "The event which will be emitted.", + "event_data": "The data which will be passed to the event listeners." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2108, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2109, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2110, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventData", + "type": "reference", + "target": "2062" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2132, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2133, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.PERSIST_STATE]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2134, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData", + "type": "reference", + "target": "2048" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2135, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2136, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.SYSTEM_INFO]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2137, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventSystemInfoData", + "type": "reference", + "target": "2051" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2138, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2139, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.MIGRATING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2140, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventMigratingData", + "type": "reference", + "target": "2055" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2141, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2142, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.ABORTING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2143, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventAbortingData", + "type": "reference", + "target": "2058" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2144, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2145, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.EXIT]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2146, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventExitData", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2147, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2148, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2149, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.emit", + "target": 2107, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3521, + "module": "events._event_manager", + "name": "wait_for_all_listeners_to_complete", + "parsedDocstring": { + "text": "Wait for all currently executing event listeners to complete.\n", + "args": { + "timeout": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 238 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "flags": {}, + "id": 2112, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_listeners_to_complete", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2113, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.wait_for_all_listeners_to_complete", + "target": 2111, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.wait_for_all_listeners_to_complete", + "target": 2111, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Event manager for local environments.\n\nIt extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`\nis intended to be used in local environments, where the system metrics are required managing the `Snapshotter`\nand `AutoscaledPool`." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2075, + 2077, + 2068, + 3520, + 2072, + 3519, + 3518, + 3521 + ], + "title": "Methods" + }, + { + "children": [ + 3517 + ], + "title": "Properties" + } + ], + "id": 2067, + "module": "events._local_event_manager", + "name": "LocalEventManager", + "parsedDocstring": { + "text": "Event manager for local environments.\n\nIt extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`\nis intended to be used in local environments, where the system metrics are required managing the `Snapshotter`\nand `AutoscaledPool`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "EventManager", + "target": "2086", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2082, + "module": "events._event_manager", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2084, + "module": "events._event_manager", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval between emitted `PersistState` events to maintain state persistence." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2085, + "module": "events._event_manager", + "name": "close_timeout", + "parsedDocstring": { + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `EventManager` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2085, + 2084 + ], + "title": "Properties" + } + ], + "id": 2083, + "module": "events._event_manager", + "name": "EventManagerOptions", + "parsedDocstring": { + "text": "Arguments for the `EventManager` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2087, + "module": "events._event_manager", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_state_interval": "Interval between emitted `PersistState` events to maintain state persistence.", + "close_timeout": "Optional timeout for canceling pending event listeners if they exceed this duration." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2088, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "close_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2091, + "module": "events._event_manager", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the event manager upon entering the async context.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2092, + "module": "events._event_manager", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the event manager upon entering the async context.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the event manager upon entering the async context.\n" + } + ] + }, + "flags": {}, + "id": 2093, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "EventManager", + "type": "reference", + "target": "2086" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2094, + "module": "events._event_manager", + "name": "__aexit__", + "parsedDocstring": { + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + } + ] + }, + "flags": {}, + "id": 2095, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2097, + 
"kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2099, + "module": "events._event_manager", + "name": "on", + "parsedDocstring": { + "text": "Register an event listener for a specific event.\n", + "args": { + "event": "The event for which to listen to.", + "listener": "The function (sync or async) which is to be called when the event is emitted." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2100, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2101, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2102, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2114, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2115, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.PERSIST_STATE" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2116, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventPersistStateData", + "target": "2048" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2117, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2118, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.SYSTEM_INFO" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2119, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventSystemInfoData", + "target": "2051" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2120, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2121, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.MIGRATING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2122, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventMigratingData", + "target": "2055" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2123, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2124, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.ABORTING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2125, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventAbortingData", + "target": "2058" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2126, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2127, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.EXIT" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2128, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventExitData", + "target": "2060" + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 2129, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2130, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2131, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ], + "target": "2065" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2103, + "module": "events._event_manager", + "name": "off", + "parsedDocstring": { + "text": "Remove a specific listener or all listeners for an event.\n", + "args": { + "event": "The Actor event for which to remove listeners.", + "listener": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "flags": {}, + "id": 2104, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "off", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Actor event for which to remove listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2105, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventListener", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ], + "target": "2065" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2107, + "module": "events._event_manager", + "name": "emit", + "parsedDocstring": { + "text": "Emit an event with the associated data to all registered listeners.\n", + "args": { + "event": "The event which will be emitted.", + "event_data": "The data which will be passed to the event listeners." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2108, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2109, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2110, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventData", + "type": "reference", + "target": "2062" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2132, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2133, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.PERSIST_STATE" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2134, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData", + "type": "reference", + "target": "2048" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2135, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2136, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.SYSTEM_INFO" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2137, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventSystemInfoData", + "type": "reference", + "target": "2051" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2138, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2139, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.MIGRATING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2140, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventMigratingData", + "type": "reference", + "target": "2055" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2141, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2142, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.ABORTING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2143, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventAbortingData", + "type": "reference", + "target": "2058" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2144, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2145, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.EXIT" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2146, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventExitData", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 2147, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2148, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "2036" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2149, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2111, + "module": "events._event_manager", + "name": "wait_for_all_listeners_to_complete", + "parsedDocstring": { + "text": "Wait for all currently executing event listeners to complete.\n", + "args": { + "timeout": "The maximum time to wait for the event listeners to finish. 
If they do not complete within\nthe specified timeout, they will be canceled." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 238 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "flags": {}, + "id": 2112, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_listeners_to_complete", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2113, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manage events and their listeners, enabling registration, emission, and execution control.\n\nIt allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.\nBuilt on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all\nlisteners to complete and emitting `PersistState` events at regular intervals." 
+ } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2092, + 2094, + 2087, + 2107, + 2103, + 2099, + 2111 + ], + "title": "Methods" + }, + { + "children": [ + 2091 + ], + "title": "Properties" + } + ], + "id": 2086, + "module": "events._event_manager", + "name": "EventManager", + "parsedDocstring": { + "text": "Manage events and their listeners, enabling registration, emission, and execution control.\n\nIt allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.\nBuilt on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all\nlisteners to complete and emitting `PersistState` events at regular intervals." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "LocalEventManager", + "target": "2067", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2151, + "module": "crawlers._types", + "name": "reason", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "No reason means no blocking." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2152, + "module": "crawlers._types", + "name": "__bool__", + "parsedDocstring": { + "text": "No reason means no blocking." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "No reason means no blocking." + } + ] + }, + "flags": {}, + "id": 2153, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2152 + ], + "title": "Methods" + }, + { + "children": [ + 2151 + ], + "title": "Properties" + } + ], + "id": 2150, + "module": "crawlers._types", + "name": "BlockedInfo", + "parsedDocstring": { + "text": "Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Scroll to the bottom of a page, handling loading of additional items." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2154, + "module": "crawlers._playwright._utils", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "Scroll to the bottom of a page, handling loading of additional items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Scroll to the bottom of a page, handling loading of additional items." + } + ] + }, + "flags": {}, + "id": 2155, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "infinite_scroll", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2156, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2157, + "module": "crawlers._playwright._utils", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns.\n", + "args": { + "page": "Playwright Page object to block requests on.", + "url_patterns": "List of URL patterns to block. If None, uses default patterns.", + "extra_url_patterns": "Additional URL patterns to append to the main patterns list." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns.\n" + } + ] + }, + "flags": {}, + "id": 2158, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "block_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Playwright Page object to block requests on." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2159, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of URL patterns to block. If None, uses default patterns." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2160, + "kind": 32768, + "kindString": "Parameter", + "name": "url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional URL patterns to append to the main patterns list." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2161, + "kind": 32768, + "kindString": "Parameter", + "name": "extra_url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2163, + "module": "crawlers._playwright._types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "url_patterns": "List of URL patterns to block. If None, uses default patterns.", + "extra_url_patterns": "Additional URL patterns to append to the main patterns list." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 2164, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of URL patterns to block. If None, uses default patterns." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2165, + "kind": 32768, + "kindString": "Parameter", + "name": "url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional URL patterns to append to the main patterns list." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2166, + "kind": 32768, + "kindString": "Parameter", + "name": "extra_url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.\n\nIt simplifies the process of blocking specific HTTP requests during page navigation.\nThe function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns." 
+ } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2163 + ], + "title": "Methods" + } + ], + "id": 2162, + "module": "crawlers._playwright._types", + "name": "BlockRequestsFunction", + "parsedDocstring": { + "text": "A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.\n\nIt simplifies the process of blocking specific HTTP requests during page navigation.\nThe function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2168, + "module": "crawlers._playwright._types", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2169, + "module": "crawlers._playwright._types", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 
1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2170, + "module": "crawlers._playwright._types", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "123" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2171, + "module": "crawlers._playwright._types", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2172, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2173, + "module": "crawlers._playwright._types", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2174, + "kind": 4096, + "kindString": "Call signature", + 
"modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2175, + "module": "crawlers._playwright._types", + "name": "from_playwright_response", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_playwright_response", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2177, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Response | APIResponse", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Response" + }, + { + "type": "reference", + "name": "APIResponse" + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2178, + "kind": 32768, + "kindString": "Parameter", + "name": "protocol", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2175, + 2171, + 2173 + ], + "title": "Methods" + }, + { + "children": [ + 2170, + 2168, + 2169 + ], + "title": "Properties" + } + ], + "id": 2167, + "module": "crawlers._playwright._types", + "name": "PlaywrightHttpResponse", + "parsedDocstring": { + "text": "Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2180, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2181, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "BlockRequestsFunction", + "type": "reference", + "target": "2162" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2182, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2183, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3409, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3410, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3411, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3412, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3413, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3414, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3415, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3416, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3417, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3418, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The pre navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` object, before the navigation to the URL is performed." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3418, + 2182 + ], + "title": "Methods" + }, + { + "children": [ + 3413, + 2181, + 3416, + 3417, + 2180, + 3411, + 3414, + 3409, + 3412, + 3410, + 3415 + ], + "title": "Properties" + } + ], + "id": 2179, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "PlaywrightPreNavCrawlingContext", + "parsedDocstring": { + "text": "The pre navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` object, before the navigation to the URL is performed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlingContext", + "target": "309", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "PlaywrightCrawlingContext", + "target": "2215", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." + } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 2184, + "module": "crawlers._playwright._playwright_http_client", + "name": "browser_page_context", + "parsedDocstring": { + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." 
+ } + ] + }, + "flags": {}, + "id": 2185, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "browser_page_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2186, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2188, + "module": "crawlers._playwright._playwright_http_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." 
+ } + ] + }, + "flags": {}, + "id": 2189, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 1920, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2190, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1928, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1929, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "1917" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 1924, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2196, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the 
request.", + "proxy_info": "The information about the proxy to be used.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 1931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1932, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1933, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1935, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 1930, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it 
entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2204, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 1939, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1941, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1944, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "1575" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": 
"reference", + "name": "HttpResponse", + "target": "1909" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 1938, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2213, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "flags": {}, + "id": 1948, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 1947, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3528, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 1923, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3529, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + 
"flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 1949, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3530, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 1952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1953, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 1951, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the Playwright library.\n\nThis client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\nNote: This class is pre-designated for use in `PlaywrightCrawler` only" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3529, + 3530, + 2188, + 2213, + 2190, + 2196, + 2204 + ], + "title": "Methods" + }, + { + "children": [ + 3528 + ], + "title": "Properties" + } + ], + "id": 2187, + "module": "crawlers._playwright._playwright_http_client", + "name": "PlaywrightHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the Playwright library.\n\nThis client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\nNote: This class is pre-designated for use in `PlaywrightCrawler` only" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "1919", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + 
"children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2216, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Response", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `EnqueueLinksFunction` implementation." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2217, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "The Playwright `EnqueueLinksFunction` implementation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `ExtractLinksFunction` implementation." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2218, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "The Playwright `ExtractLinksFunction` implementation." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering\nthe loading of additional content if present." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2219, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering\nthe loading of additional content if present." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3504, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Page", + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.page", + "target": 2180, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3505, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "BlockRequestsFunction", + "type": "reference", + "target": "2162" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.block_requests", + "target": 2181, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3506, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2183, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 2182, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 2182, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3507, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3508, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3509, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3510, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3511, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3512, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3513, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3514, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3515, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3516, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3516, + 3506 + ], + "title": "Methods" + }, + { + "children": [ + 3511, + 3505, + 2217, + 2218, + 3514, + 2219, + 3515, + 3504, + 3509, + 3512, + 3507, + 2216, + 3510, + 3508, + 3513 + ], + "title": "Properties" + } + ], + "id": 2215, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "PlaywrightCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "PlaywrightPreNavCrawlingContext", + "target": "2179", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2220, + "module": "crawlers._playwright._playwright_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2221, + "module": "crawlers._playwright._playwright_crawler", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2223, + "module": "crawlers._playwright._playwright_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + 
"browser_pool": "A `BrowserPool` instance to be used for launching the browsers and getting pages.", + "user_data_dir": "Path to a user data directory, which stores browser session data like cookies\nand local storage.", + "browser_type": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).\nThis option should not be used if `browser_pool` is provided.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).\nThis option should not be used if `browser_pool` is provided.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers.", + "headless": "Whether to run the browser in headless mode.\nThis option should not be used if `browser_pool` is provided.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.\nThis option should not be used if `browser_pool` is provided.", + "kwargs": "Additional keyword arguments to pass to the underlying `BasicCrawler`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2224, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2225, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_pool", + "type": { + "name": "BrowserPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserPool", + "target": "2763" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2226, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserType", + "target": "2677" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a user data directory, which stores browser session data like cookies\nand local storage." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2227, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2228, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).\nThis option should not be used if `browser_pool` is provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2229, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "'default'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2230, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None | Literal['default']", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1991" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "default" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode.\nThis option should not be used if `browser_pool` is provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2231, + "kind": 32768, + "kindString": "Parameter", + "name": "headless", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2232, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3252, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3253, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3254, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3255, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3256, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3257, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3258, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3259, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3260, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3261, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3262, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3263, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3264, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3265, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3266, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3267, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3268, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3269, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3270, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3271, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3272, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3273, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3274, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3275, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2234, + "module": "crawlers._playwright._playwright_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 448 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 2235, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2236, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[PlaywrightPreNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3307, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3308, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3309, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3310, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." + } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3311, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3312, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3313, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3314, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3315, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3316, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3317, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3318, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3319, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3320, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3321, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3322, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler that leverages the `Playwright` browser automation library.\n\nThe `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.\nOn top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more\nspecific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they\nopen. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let\nthe crawler create a new instance with the default settings.\n\nThis crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers\nto download web pages and extract data. For websites that do not require JavaScript, consider using one of the\nHTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. 
They use\nraw HTTP requests, which means they are much faster.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\ncrawler = PlaywrightCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: PlaywrightCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': await context.page.title(),\n 'response': (await context.response.text())[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2223, + 3318, + 3314, + 3320, + 3321, + 3322, + 3315, + 3319, + 3312, + 3313, + 3311, + 3316, + 2234, + 3317, + 3310 + ], + "title": "Methods" + }, + { + "children": [ + 3307, + 3308, + 3309 + ], + "title": "Properties" + } + ], + "id": 2222, + "module": "crawlers._playwright._playwright_crawler", + "name": "PlaywrightCrawler", + "parsedDocstring": { + "text": "A web crawler that leverages the `Playwright` browser automation library.\n\nThe `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.\nOn top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more\nspecific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they\nopen. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let\nthe crawler create a new instance with the default settings.\n\nThis crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers\nto download web pages and extract data. 
For websites that do not require JavaScript, consider using one of the\nHTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. They use\nraw HTTP requests, which means they are much faster.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\ncrawler = PlaywrightCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: PlaywrightCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': await context.page.title(),\n 'response': (await context.response.text())[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawler", + "target": "2394", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2238, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_pool", + "parsedDocstring": { + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 483 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPool", + "target": "2763" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2239, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_type", + "parsedDocstring": { + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 486 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserType", + "target": "2677" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2240, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 490 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2241, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 496 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2242, + "module": "crawlers._playwright._playwright_crawler", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 502 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional arguments for the `PlaywrightCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses.\nAll arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2240, + 2241, + 2238, + 2239, + 2242 + ], + "title": "Properties" + } + ], + "id": 2237, + "module": "crawlers._playwright._playwright_crawler", + "name": "_PlaywrightCrawlerAdditionalOptions", + "parsedDocstring": { + "text": "Additional arguments for the `PlaywrightCrawler` constructor.\n\nIt is intended for typing forwarded 
`__init__` arguments in the subclasses.\nAll arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 476 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightCrawlerOptions", + "target": "2243", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3278, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3279, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3280, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3281, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3282, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3283, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3284, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3285, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3286, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3287, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3288, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3289, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3290, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3291, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3292, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3293, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3294, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3295, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3296, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3297, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3298, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3299, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3300, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3301, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. 
This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3302, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_pool", + "parsedDocstring": { + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 483 + } + ], + "type": { + "name": "NotRequired[BrowserPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_pool", + "target": 2238, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3303, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_type", + "parsedDocstring": { + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit').\nThis option should not be used if `browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 486 + } + ], + "type": { + "name": "NotRequired[BrowserType]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_type", + "target": 2239, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3304, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 490 + } + ], + "type": { + "name": "NotRequired[Mapping[str, Any]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_launch_options", + "target": 2240, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3305, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 496 + } + ], + "type": { + "name": "NotRequired[Mapping[str, Any]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_new_context_options", + "target": 2241, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3306, + "module": "crawlers._playwright._playwright_crawler", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 502 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.headless", + "target": 2242, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3295, + 3299, + 3304, + 3305, + 3302, + 3303, + 3293, + 3280, + 3296, + 3281, + 3306, + 3286, + 3300, + 3298, + 3290, + 3287, + 3288, + 3289, + 3285, + 3278, + 3294, + 3283, + 3301, + 3292, + 3284, + 3279, + 3297, + 3282, + 3291 + ], + "title": "Properties" + } + ], + "id": 2243, + "module": "crawlers._playwright._playwright_crawler", + "name": "PlaywrightCrawlerOptions", + "parsedDocstring": { + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 507 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlerOptions", + "target": "2393", + "type": "reference" + }, + { + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "2237", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2244, + "module": "crawlers._parsel._utils", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n", + "args": { + "source": "Input markup string or `Selector` object.\n" + }, + "returns": "Newline separated plain text without tags." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Newline separated plain text without tags." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n" + } + ] + }, + "flags": {}, + "id": 2245, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Input markup string or `Selector` object.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2246, + "kind": 32768, + "kindString": "Parameter", + "name": "source", + "type": { + "name": "str | Selector", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Selector" + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2248, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse http response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "flags": {}, + "id": 2644, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2645, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2251, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 2647, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2648, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2254, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 2650, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2651, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2652, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": 
"abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2258, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 2657, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2658, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2659, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2262, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.\n" + }, + "returns": "Iterable of strings that contain found links." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 2661, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2662, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2663, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3276, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional 
information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 2654, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2655, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "2150" + }, + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for parsing HTTP response using Parsel." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2262, + 3276, + 2258, + 2248, + 2251, + 2254 + ], + "title": "Methods" + } + ], + "id": 2247, + "module": "crawlers._parsel._parsel_parser", + "name": "ParselParser", + "parsedDocstring": { + "text": "Parser for parsing HTTP response using Parsel." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "2642", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convenience alias." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2267, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "selector", + "parsedDocstring": { + "text": "Convenience alias." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Selector", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2268, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." 
+ } + ] + }, + "flags": {}, + "id": 2269, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2270, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Selector" + } + ], + "target": "2632" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2271, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." 
+ } + ] + }, + "flags": {}, + "id": 2272, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3453, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 2633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3454, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 2634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3455, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, 
+ "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 2635, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3456, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 2637, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2638, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "2625" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2639, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2640, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2641, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3457, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2627, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2628, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2629, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3458, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3459, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 1918, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3460, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3461, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3462, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3463, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3464, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3465, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3466, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3467, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3468, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3469, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. 
Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `ParselCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3469, + 3457, + 3456, + 2268, + 3458, + 2271 + ], + "title": "Methods" + }, + { + "children": [ + 3464, + 3454, + 3455, + 3467, + 3459, + 3468, + 3453, + 3462, + 3465, + 3460, + 2267, + 3463, + 3461, + 3466 + ], + "title": "Properties" + } + ], + "id": 2266, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "ParselCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `ParselCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "2632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2274, + "module": "crawlers._parsel._parsel_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3252, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3253, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3254, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3255, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3256, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3257, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3258, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3259, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3260, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3261, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3262, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3263, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3264, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3265, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3266, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3267, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3268, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3269, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3270, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3271, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3272, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3273, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3274, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3275, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3355, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "flags": {}, + "id": 2672, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2673, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3356, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + 
"hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 2675, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2676, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3357, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3358, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3359, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3360, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3361, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3362, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3363, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3364, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3365, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3366, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3367, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3368, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3369, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3370, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3371, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3372, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `ParselParser` which is used to parse `HttpResponse`.\n`ParselParser` uses following library for parsing: https://pypi.org/project/parsel/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\ncrawler = ParselCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: ParselCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.selector.css('title').get(),\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2274, + 3368, + 3355, + 3364, + 3370, + 3371, + 3372, + 3365, + 3369, + 3362, + 3363, + 3361, + 3366, + 3356, + 3367, + 3360 + ], + "title": "Methods" + }, + { + "children": [ + 3357, + 3358, + 3359 + ], + "title": "Properties" + } + ], + "id": 2273, + "module": "crawlers._parsel._parsel_crawler", + "name": "ParselCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `ParselParser` which is used to parse `HttpResponse`.\n`ParselParser` uses following library for parsing: https://pypi.org/project/parsel/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\ncrawler = ParselCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: ParselCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.selector.css('title').get(),\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_parsel/_parsel_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "2666", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2278, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse http response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "flags": {}, + "id": 2644, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2645, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2281, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 2647, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2648, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2284, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 2650, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2651, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2652, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that 
expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2288, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 2654, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2655, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "2150" + }, + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2291, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 2657, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2658, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2659, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2295, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.\n" + }, + "returns": "Iterable of strings that contain found links." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 2661, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2662, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2663, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Dummy parser for backwards compatibility.\n\nTo enable using `HttpCrawler` without need for additional specific parser." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2295, + 2288, + 2291, + 2278, + 2281, + 2284 + ], + "title": "Methods" + } + ], + "id": 2277, + "module": "crawlers._http._http_parser", + "name": "NoParser", + "parsedDocstring": { + "text": "Dummy parser for backwards compatibility.\n\nTo enable using `HttpCrawler` without need for additional specific parser." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "2642", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2300, + "module": "crawlers._http._http_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2301, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3252, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3253, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3254, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3255, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3256, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3257, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3258, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3259, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3260, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3261, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3262, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3263, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3264, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3265, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3266, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3267, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3268, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3269, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3270, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3271, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3272, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3273, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3274, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3275, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3373, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "flags": {}, + "id": 2672, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2673, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3374, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + 
"hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 2675, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2676, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3375, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3376, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3377, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3378, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3379, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3380, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3381, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3382, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3383, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3384, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3385, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3386, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3387, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3388, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3389, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3390, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific version of generic `AbstractHttpCrawler`.\n\nIt uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are\ndoing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using\n`BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\ncrawler = HttpCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'response': context.http_response.read().decode()[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2300, + 3386, + 3373, + 3382, + 3388, + 3389, + 3390, + 3383, + 3387, + 3380, + 3381, + 3379, + 3384, + 3374, + 3385, + 3378 + ], + "title": "Methods" + }, + { + "children": [ + 3375, + 3376, + 3377 + ], + "title": "Properties" + } + ], + "id": 2299, + 
"module": "crawlers._http._http_crawler", + "name": "HttpCrawler", + "parsedDocstring": { + "text": "Specific version of generic `AbstractHttpCrawler`.\n\nIt uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are\ndoing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using\n`BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\ncrawler = HttpCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'response': context.http_response.read().decode()[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_http/_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "2666", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2303, + "module": "crawlers._beautifulsoup._utils", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n", + 
"args": { + "source": "Input markup string or `BeautifulSoup` object.\n" + }, + "returns": "Newline separated plain text without tags." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Newline separated plain text without tags." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n" + } + ] + }, + "flags": {}, + "id": 2304, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Input markup string or `BeautifulSoup` object.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2305, + "kind": 32768, + "kindString": "Parameter", + "name": "source", + "type": { + "name": "str | Tag", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Tag" + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2307, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + 
"type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2308, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2309, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + "target": "2328" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2310, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse http response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "flags": {}, + "id": 2644, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2645, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2313, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 2647, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2648, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2316, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 2657, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2658, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2659, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2320, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 2650, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2651, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2652, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + 
"name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2324, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.\n" + }, + "returns": "Iterable of strings that contain found links." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 2661, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2662, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2663, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3277, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. 
Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 2654, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2655, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "2150" + }, + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for parsing HTTP response using `BeautifulSoup`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2307, + 2324, + 3277, + 2316, + 2310, + 2313, + 2320 + ], + "title": "Methods" + } + ], + "id": 2306, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "BeautifulSoupParser", + "parsedDocstring": { + "text": "Parser for parsing HTTP response using `BeautifulSoup`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "2642", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2328, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "BeautifulSoupParserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convenience alias." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2330, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "soup", + "parsedDocstring": { + "text": "Convenience alias." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "BeautifulSoup", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2331, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 2332, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2333, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BeautifulSoup" + } + ], + "target": "2632" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2334, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." 
+ } + ] + }, + "flags": {}, + "id": 2335, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3470, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 2633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3471, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 2634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3472, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, 
+ "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 2635, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3473, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 2637, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2638, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "2625" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2639, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2640, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2641, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3474, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2627, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2628, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2629, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3475, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3476, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 1918, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3477, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3478, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3479, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3480, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3481, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3482, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3483, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3484, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3485, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3486, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. 
Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `BeautifulSoupCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3486, + 3474, + 3473, + 2331, + 3475, + 2334 + ], + "title": "Methods" + }, + { + "children": [ + 3481, + 3471, + 3472, + 3484, + 3476, + 3485, + 3470, + 3479, + 3482, + 3477, + 3480, + 3478, + 2330, + 3483 + ], + "title": "Properties" + } + ], + "id": 2329, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "BeautifulSoupCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `BeautifulSoupCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "2632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2337, + "module": "crawlers._beautifulsoup._beautifulsoup_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "parser": "The type of parser that should be used by `BeautifulSoup`.", + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2338, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of parser that should be used by `BeautifulSoup`." 
+ } + ] + }, + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2339, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + "target": "2328" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3252, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3253, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3254, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3255, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3256, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3257, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3258, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3259, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3260, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3261, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3262, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3263, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3264, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3265, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3266, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3267, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3268, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3269, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3270, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3271, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3272, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3273, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3274, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 3275, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 2667, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3391, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "flags": {}, + "id": 2672, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2673, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 2671, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3392, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 2675, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2676, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 2674, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3393, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3394, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3395, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3396, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3397, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3398, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3399, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3400, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3401, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3402, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3403, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3404, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3405, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3406, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3407, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3408, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.\n`BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\ncrawler = BeautifulSoupCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.soup.title.string if context.soup.title else None,\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2337, + 3404, + 3391, + 3400, + 3406, + 3407, + 3408, + 3401, + 3405, + 3398, + 3399, + 3397, + 3402, + 3392, + 3403, + 3396 + ], + "title": "Methods" + }, + { + "children": [ + 3393, + 3394, + 3395 + ], + "title": "Properties" + } + ], + "id": 2336, + "module": "crawlers._beautifulsoup._beautifulsoup_crawler", + "name": "BeautifulSoupCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.\n`BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\ncrawler = BeautifulSoupCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.soup.title.string if context.soup.title else None,\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "2666", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2341, + "module": "crawlers._basic._logging_utils", + "name": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_logging_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2342, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "parameters": [ + { 
+ "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2343, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout_error", + "type": { + "name": "asyncio.exceptions.TimeoutError", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2344, + "module": "crawlers._basic._logging_utils", + "name": "get_one_line_error_summary_if_possible", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_logging_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2345, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_one_line_error_summary_if_possible", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2346, + "kind": 32768, + "kindString": "Parameter", + "name": "error", + "type": { + "name": "Exception", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2347, + "module": "crawlers._basic._context_pipeline", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2348, + "module": "crawlers._basic._context_pipeline", + "name": "TMiddlewareCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2350, + "module": "crawlers._basic._context_pipeline", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2351, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2352, + "kind": 32768, + "kindString": "Parameter", + "name": "_middleware", + "type": { + "name": "Callable[ [TCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None], ] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "AsyncGenerator", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + 
"target": "2348" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2353, + "kind": 32768, + "kindString": "Parameter", + "name": "_parent", + "type": { + "name": "ContextPipeline[BasicCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "309" + } + ], + "target": "2349" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2354, + "module": "crawlers._basic._context_pipeline", + "name": "__call__", + "parsedDocstring": { + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." + } + ] + }, + "flags": {}, + "id": 2355, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__call__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2356, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2357, + "kind": 32768, + "kindString": "Parameter", + "name": "final_context_consumer", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2358, + "module": "crawlers._basic._context_pipeline", + "name": "compose", + 
"parsedDocstring": { + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n", + "returns": "The extended pipeline instance, providing a fluent interface" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The extended pipeline instance, providing a fluent interface" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n" + } + ] + }, + "flags": {}, + "id": 2359, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compose", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2360, + "kind": 32768, + "kindString": "Parameter", + "name": "middleware", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "AsyncGenerator", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "2348" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "ContextPipeline", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "2348" + } + ], + "target": "2349" + } + } + ] + } 
+ ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.\n\nThe enhancement is done by a chain of middlewares that are added to the pipeline after it's creation." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2354, + 2350, + 2358 + ], + "title": "Methods" + } + ], + "id": 2349, + "module": "crawlers._basic._context_pipeline", + "name": "ContextPipeline", + "parsedDocstring": { + "text": "Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.\n\nThe enhancement is done by a chain of middlewares that are added to the pipeline after it's creation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2361, + "module": "crawlers._basic._basic_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2362, + "module": "crawlers._basic._basic_crawler", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2363, + "module": "crawlers._basic._basic_crawler", + "name": "TRequestIterator", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2364, + "module": "crawlers._basic._basic_crawler", + "name": "ErrorHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2365, + "module": "crawlers._basic._basic_crawler", + "name": "FailedRequestHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 2366, + "module": "crawlers._basic._basic_crawler", + "name": "SkippedRequestCallback", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2368, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2369, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2370, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2371, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2372, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2373, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2374, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2375, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2376, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2377, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2378, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2379, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2380, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2381, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2382, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2383, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2384, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2385, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2386, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2387, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2388, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2389, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. 
This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Non-generic options the `BasicCrawler` constructor." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2383, + 2387, + 2381, + 2368, + 2384, + 2369, + 2374, + 2388, + 2386, + 2378, + 2375, + 2376, + 2377, + 2373, + 2382, + 2371, + 2389, + 2380, + 2372, + 2385, + 2370, + 2379 + ], + "title": "Properties" + } + ], + "id": 2367, + "module": "crawlers._basic._basic_crawler", + "name": "_BasicCrawlerOptions", + "parsedDocstring": { + "text": "Non-generic options the `BasicCrawler` constructor." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 98 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BasicCrawlerOptions", + "target": "2393", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2391, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2392, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ], + "target": "1403" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generic options the `BasicCrawler` constructor." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2391, + 2392 + ], + "title": "Properties" + } + ], + "id": 2390, + "module": "crawlers._basic._basic_crawler", + "name": "_BasicCrawlerOptionsGeneric", + "parsedDocstring": { + "text": "Generic options the `BasicCrawler` constructor." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BasicCrawlerOptions", + "target": "2393", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3252, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 2391, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3253, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 2392, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3254, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 2368, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3255, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 2369, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3256, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 2370, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3257, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 2371, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3258, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 2372, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3259, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 2373, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3260, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 2374, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3261, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 2375, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3262, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 2376, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3263, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 2377, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3264, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 2378, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3265, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 2379, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3266, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 2380, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3267, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 2381, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3268, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 2382, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3269, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 2383, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3270, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 2384, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3271, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 2385, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3272, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 2386, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3273, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 2387, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3274, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 2388, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3275, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. 
This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 2389, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `BasicCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3269, + 3273, + 3267, + 3254, + 3270, + 3255, + 3260, + 3274, + 3272, + 3264, + 3261, + 3262, + 3263, + 3259, + 3252, + 3268, + 3257, + 3275, + 3266, + 3258, + 3253, + 3271, + 3256, + 3265 + ], + "title": "Properties" + } + ], + "id": 2393, + "module": "crawlers._basic._basic_crawler", + "name": "BasicCrawlerOptions", + "parsedDocstring": { + "text": "Arguments for the `BasicCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 208 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_BasicCrawlerOptionsGeneric", + "target": "2390", + "type": "reference" + }, + { + "name": "_BasicCrawlerOptions", + "target": "2367", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "PlaywrightCrawlerOptions", + "target": "2243", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2395, + "module": "crawlers._basic._basic_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "configuration": "The `Configuration` instance. 
Some of its properties are used as defaults for the crawler.", + "event_manager": "The event manager for managing events for the crawler and all its components.", + "storage_client": "The storage client for managing storages for the crawler and all its components.", + "request_manager": "Manager of requests that should be processed by the crawler.", + "session_pool": "A custom `SessionPool` instance, allowing the use of non-default configuration.", + "proxy_configuration": "HTTP proxy configuration used when making requests.", + "http_client": "HTTP client used by `BasicCrawlingContext.send_request` method.", + "request_handler": "A callable responsible for handling requests.", + "max_request_retries": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`).", + "max_requests_per_crawl": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved.", + "max_session_rotations": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit.", + "max_crawl_depth": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. 
Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions.", + "use_session_pool": "Enable the use of a session pool for managing sessions during crawling.", + "retry_on_blocked": "If True, the crawler attempts to bypass bot protections automatically.", + "additional_http_error_status_codes": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered.", + "ignore_http_error_status_codes": "HTTP status codes that are typically considered errors but should be treated\nas successful responses.", + "concurrency_settings": "Settings to fine-tune concurrency levels.", + "request_handler_timeout": "Maximum duration allowed for a single request handler to run.", + "statistics": "A custom `Statistics` instance, allowing the use of non-default configuration.", + "abort_on_error": "If True, the crawler stops immediately when any request handler error occurs.", + "keep_alive": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler.", + "configure_logging": "If True, the crawler will set up logging infrastructure automatically.", + "statistics_log_format": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages.", + "respect_robots_txt_file": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. 
This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`", + "_context_pipeline": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_additional_context_managers": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_logger": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2396, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2397, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2398, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2399, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2400, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2401, + "kind": 32768, + "kindString": "Parameter", + "name": "session_pool", + "type": { + "name": "SessionPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2402, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_configuration", + "type": { + "name": "ProxyConfiguration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2403, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2404, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler", + "type": { + "name": "Callable[[TCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2405, + "kind": 32768, + "kindString": "Parameter", + "name": "max_request_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2406, + "kind": 32768, + "kindString": "Parameter", + "name": "max_requests_per_crawl", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "defaultValue": "10", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2407, + "kind": 32768, + "kindString": "Parameter", + "name": "max_session_rotations", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2408, + "kind": 32768, + "kindString": "Parameter", + "name": "max_crawl_depth", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2409, + "kind": 32768, + "kindString": "Parameter", + "name": "use_session_pool", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2410, + "kind": 32768, + "kindString": "Parameter", + "name": "retry_on_blocked", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2411, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated\nas successful responses." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2412, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2413, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2414, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2415, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[TStatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ], + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2416, + "kind": 32768, + "kindString": "Parameter", + "name": "abort_on_error", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2417, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_alive", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2418, + "kind": 32768, + "kindString": "Parameter", + "name": "configure_logging", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages." 
+ } + ] + }, + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2419, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2420, + "kind": 32768, + "kindString": "Parameter", + "name": "respect_robots_txt_file", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2421, + "kind": 32768, + "kindString": "Parameter", + "name": "_context_pipeline", + "type": { + "name": "ContextPipeline[TCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "2349" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2422, + "kind": 32768, + "kindString": "Parameter", + "name": "_additional_context_managers", + "type": { + "name": "Sequence[AbstractAsyncContextManager] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractAsyncContextManager" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2423, + "kind": 32768, + "kindString": "Parameter", + "name": "_logger", + "type": { + "name": "logging.Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "logging.Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2424, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2425, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "3" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "router" + } + ], + "flags": {}, + "groups": [], + "id": 2426, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 453 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2427, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "router", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2428, + "kind": 32768, + "kindString": "Parameter", + "name": 
"router", + "type": { + "name": "Router", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "3" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2429, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ], + "target": "1403" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2430, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." + } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2433, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. 
If none is configured, open and return the default request queue." + } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2435, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2439, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2443, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + }, + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "309" + } + ] + } + ], + "target": "2364" + } + } + ], + "type": { + "name": "ErrorHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "2364" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2446, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + }, + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "309" + } + ] + } + ], + "target": "2365" + } + } + ], + "type": { + "name": "FailedRequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "2365" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2449, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2452, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2456, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "398" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2463, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skip the specified number of items at the start." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 626, + "module": "storages._dataset", + "name": "offset", + "parsedDocstring": { + "text": "Skip the specified number of items at the start." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 627, + "module": "storages._dataset", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. Unlimited if None." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 628, + "module": "storages._dataset", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 629, + "module": "storages._dataset", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 630, + "module": "storages._dataset", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 631, + "module": "storages._dataset", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwind items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 632, + "module": "storages._dataset", + "name": "unwind", + "parsedDocstring": { + "text": "Unwind items by a specified array field, turning each element into a separate item." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude empty items from the results if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 633, + "module": "storages._dataset", + "name": "skip_empty", + "parsedDocstring": { + "text": "Exclude empty items from the results if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exclude fields starting with '#' if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 634, + "module": "storages._dataset", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Exclude fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Field to be flattened in returned items." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 635, + "module": "storages._dataset", + "name": "flatten", + "parsedDocstring": { + "text": "Field to be flattened in returned items." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specify the dataset view to be used." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 636, + "module": "storages._dataset", + "name": "view", + "parsedDocstring": { + "text": "Specify the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2468, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. 
It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2473, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 653, + "module": "storages._dataset", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 654, + "module": "storages._dataset", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 655, + "module": "storages._dataset", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 656, + "module": "storages._dataset", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 657, + "module": "storages._dataset", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 658, + "module": "storages._dataset", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 659, + "module": "storages._dataset", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 660, + "module": "storages._dataset", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 661, + "module": "storages._dataset", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2479, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 643, + "module": "storages._dataset", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 644, + "module": "storages._dataset", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 645, + "module": "storages._dataset", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 646, + "module": "storages._dataset", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 647, + "module": "storages._dataset", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 648, + "module": "storages._dataset", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 649, + "module": "storages._dataset", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 650, + "module": "storages._dataset", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 651, + "module": "storages._dataset", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A basic web crawler providing a framework for crawling websites.\n\nThe `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their\nown page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific\npurposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,\n`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full\ncontrol over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic\nyourself.\n\nThe crawling process begins with URLs provided by a `RequestProvider` instance. Each request is then\nhandled by a user-defined `request_handler` function, which processes the page and extracts the data.\n\nThe `BasicCrawler` includes several common features for crawling, such as:\n- automatic scaling based on the system resources,\n- retries for failed requests,\n- session management,\n- statistics tracking,\n- request routing via labels,\n- proxy rotation,\n- direct storage interaction helpers,\n- and more." 
+ } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2395, + 2456, + 2443, + 2468, + 2473, + 2479, + 2446, + 2463, + 2435, + 2439, + 2433, + 2449, + 2426, + 2452, + 2430 + ], + "title": "Methods" + }, + { + "children": [ + 2424, + 2425, + 2429 + ], + "title": "Properties" + } + ], + "id": 2394, + "module": "crawlers._basic._basic_crawler", + "name": "BasicCrawler", + "parsedDocstring": { + "text": "A basic web crawler providing a framework for crawling websites.\n\nThe `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their\nown page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific\npurposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,\n`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full\ncontrol over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic\nyourself.\n\nThe crawling process begins with URLs provided by a `RequestProvider` instance. Each request is then\nhandled by a user-defined `request_handler` function, which processes the page and extracts the data.\n\nThe `BasicCrawler` includes several common features for crawling, such as:\n- automatic scaling based on the system resources,\n- retries for failed requests,\n- session management,\n- statistics tracking,\n- request routing via labels,\n- proxy rotation,\n- direct storage interaction helpers,\n- and more." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 220 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightCrawler", + "target": "2222", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightCrawler", + "target": "2583", + "type": "reference" + }, + { + "name": "AbstractHttpCrawler", + "target": "2666", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a default comparator function for evaluating request handler results." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2485, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "create_default_comparator", + "parsedDocstring": { + "text": "Create a default comparator function for evaluating request handler results." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a default comparator function for evaluating request handler results." 
+ } + ] + }, + "flags": {}, + "id": 2486, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_default_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2487, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2488, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "full_result_comparator", + "parsedDocstring": { + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different 
requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + } + ] + }, + "flags": {}, + "id": 2489, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "full_result_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2490, + "kind": 32768, + "kindString": "Parameter", + "name": "result_1", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "212" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2491, + "kind": 32768, + "kindString": "Parameter", + "name": "result_2", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "212" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2492, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "push_data_only_comparator", + "parsedDocstring": { + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." + } + ] + }, + "flags": {}, + "id": 2493, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "push_data_only_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2494, + "kind": 32768, + "kindString": "Parameter", + "name": "result_1", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "212" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2495, + "kind": 32768, + "kindString": "Parameter", + "name": "result_2", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "212" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2496, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "UrlComponents", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2497, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2498, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "FeatureVector", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Recommended rendering type." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2500, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "rendering_type", + "parsedDocstring": { + "text": "Recommended rendering type." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "RenderingType", + "type": "reference", + "target": "2497" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Recommended rendering detection probability. Expected values between 0-1.\n\nZero represents absolute confidence in `rendering_type` recommendation.\nOne represents no confidence in `rendering_type` recommendation." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2501, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "detection_probability_recommendation", + "parsedDocstring": { + "text": "Recommended rendering detection probability. Expected values between 0-1.\n\nZero represents absolute confidence in `rendering_type` recommendation.\nOne represents no confidence in `rendering_type` recommendation." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Rendering type recommendation with detection probability recommendation." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + }, + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2501, + 2500 + ], + "title": "Properties" + } + ], + "id": 2499, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingTypePrediction", + "parsedDocstring": { + "text": "Rendering type recommendation with detection probability recommendation." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2503, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "predict", + "parsedDocstring": { + "text": "Get `RenderingTypePrediction` based on the input request.\n", + "args": { + "request": "`Request` instance for which the prediction is made." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "flags": {}, + "id": 2504, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`Request` instance for which the prediction is made." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2505, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "RenderingTypePrediction", + "type": "reference", + "target": "2499" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2506, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "store_result", + "parsedDocstring": { + "text": "Store prediction results and retrain the model.\n", + "args": { + "request": "Used request.", + "rendering_type": "Known suitable `RenderingType`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "flags": {}, + "id": 2507, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_result", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2508, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Known suitable `RenderingType`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2509, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type", + "type": { + "name": "RenderingType", + "type": "reference", + "target": "2497" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2503, + 2506 + ], + "title": "Methods" + } + ], + "id": 2502, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingTypePredictor", + "parsedDocstring": { + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DefaultRenderingTypePredictor", + "target": "2510", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2511, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "detection_ratio": "A number between 0 and 1 that determines the desired ratio of rendering type detections." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2512, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A number between 0 and 1 that determines the desired ratio of rendering type detections." + } + ] + }, + "defaultValue": "0.1", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2513, + "kind": 32768, + "kindString": "Parameter", + "name": "detection_ratio", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2514, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "predict", + "parsedDocstring": { + "text": "Get `RenderingTypePrediction` based on the input request.\n", + "args": { + "request": "`Request` instance for which the prediction is made." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "flags": {}, + "id": 2515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`Request` instance for which the prediction is made." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2516, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + } + ], + "type": { + "name": "RenderingTypePrediction", + "type": "reference", + "target": "2499" + }, + "overwrites": { + "name": "RenderingTypePredictor.predict", + "target": 2503, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.predict", + "target": 2503, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2517, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "store_result", + "parsedDocstring": { + "text": "Store prediction results and retrain the model.\n", + "args": { + "request": "Used `Request` instance.", + "rendering_type": "Known suitable `RenderingType` for the used `Request` instance." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "flags": {}, + "id": 2518, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_result", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used `Request` instance." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2519, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "398" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Known suitable `RenderingType` for the used `Request` instance." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2520, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type", + "type": { + "name": "RenderingType", + "type": "reference", + "target": "2497" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RenderingTypePredictor.store_result", + "target": 2506, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.store_result", + "target": 2506, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.\n\n`RenderingTypePredictor` implementation based on logistic regression:\nhttps://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2511, + 2514, + 2517 + ], + "title": "Methods" + } + ], + "id": 2510, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "DefaultRenderingTypePredictor", + "parsedDocstring": { + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.\n\n`RenderingTypePredictor` implementation based on logistic regression:\nhttps://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RenderingTypePredictor", + "target": "2502", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Get list of url components where first component is host name." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2521, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "get_url_components", + "parsedDocstring": { + "text": "Get list of url components where first component is host name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get list of url components where first component is host name." + } + ] + }, + "flags": {}, + "id": 2522, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_url_components", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2523, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "2496" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. Return their weighted average." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2524, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "calculate_url_similarity", + "parsedDocstring": { + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. Return their weighted average." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 165 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. Return their weighted average." 
+ } + ] + }, + "flags": {}, + "id": 2525, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate_url_similarity", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2526, + "kind": 32768, + "kindString": "Parameter", + "name": "url_1", + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "2496" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2527, + "kind": 32768, + "kindString": "Parameter", + "name": "url_2", + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "2496" + } + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2528, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "TStaticParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2529, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "TStaticSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + 
"kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2530, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "AdaptiveContextError", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2532, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function to perform infinite scrolling on the page.\n\nThis scrolls to the bottom, triggering the loading of additional content if present.\nRaises `AdaptiveContextError` if accessed during static crawling." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2533, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "A function to perform infinite scrolling on the page.\n\nThis scrolls to the bottom, triggering the loading of additional content if present.\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2534, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Response", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2535, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "wait_for_selector", + "parsedDocstring": { + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 2536, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2537, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear." + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2538, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2539, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "query_selector_one", + "parsedDocstring": { + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "Result of used static parser `select` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Result of used static parser `select` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 2540, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "query_selector_one", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2541, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2542, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "TStaticSelectResult | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return 
all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2543, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "query_selector_all", + "parsedDocstring": { + "text": "Locate element by css selector and return all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "List of results of used static parser `select` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "List of results of used static parser `select` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 2544, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "query_selector_all", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2545, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2546, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2547, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "parse_with_static_parser", + "parsedDocstring": { + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n", + "args": { + "selector": "css selector to be used to locate specific element on page.", + "timeout": "timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "Result of used static parser `parse_text` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 146 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Result of used static parser `parse_text` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n" + } + ] + }, + "flags": {}, + "id": 2548, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_with_static_parser", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "css selector to be used to locate specific element on page." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2549, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2550, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "TStaticParseResult", + "type": "reference", + "target": "2528" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + 
"text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2551, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2552, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2553, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "2528" + } + ], + "target": "2632" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2554, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "2528" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ], + "target": "2642" + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": 
"TStaticParseResult", + "target": "2528" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ], + "target": "2531" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2555, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_playwright_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 176 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 2556, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_playwright_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2557, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "PlaywrightCrawlingContext", + "type": "reference", + "target": "2215" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2558, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "2528" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ], + "target": "2642" + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "2528" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ], + "target": "2531" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3487, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 2633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3488, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 2634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3489, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 2635, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3490, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2637, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2638, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "2625" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2639, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2640, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2641, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 2636, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3491, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2627, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2628, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2629, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3492, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3493, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 1918, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3494, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3495, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3496, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3497, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3498, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3499, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3500, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3501, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3502, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3503, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. 
Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3503, + 3491, + 3490, + 2551, + 2555, + 3492, + 2547, + 2543, + 2539, + 2535 + ], + "title": "Methods" + }, + { + "children": [ + 3498, + 3488, + 3489, + 3501, + 3493, + 2533, + 3502, + 2532, + 3487, + 3496, + 3499, + 3494, + 2534, + 3497, + 3495, + 3500 + ], + "title": "Properties" + } + ], + "id": 2531, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "AdaptivePlaywrightCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "2632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2560, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "BlockRequestsFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BlockRequestsFunction", + "target": "2162" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2561, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2562, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_pre_navigation_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2563, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_pre_navigation_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2564, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3419, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3420, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3421, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3422, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3423, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3424, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3425, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3426, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3427, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3428, + "module": "_types", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 654 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 320, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3429, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3429, + 2562, + 3428 + ], + "title": "Methods" + }, + { + "children": [ + 3423, + 2560, + 3426, + 3427, + 2561, + 3421, + 3424, + 3419, + 3422, + 3420, + 3425 + ], + "title": "Properties" + } + ], + "id": 2559, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "AdaptivePlaywrightPreNavCrawlingContext", + "parsedDocstring": { + "text": "A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlingContext", + "target": "309", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2566, + "module": "statistics._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StatisticsState.model_config", + "target": 1471, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times static http based crawling was used." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2567, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "http_only_request_handler_runs", + "parsedDocstring": { + "text": "Number representing how many times static http based crawling was used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times browser based crawling was used." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2568, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "browser_request_handler_runs", + "parsedDocstring": { + "text": "Number representing how many times browser based crawling was used." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times the predictor gave incorrect prediction." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2569, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "rendering_type_mispredictions", + "parsedDocstring": { + "text": "Number representing how many times the predictor gave incorrect prediction." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3229, + "module": "statistics._models", + "name": "stats_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='statsId')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.stats_id", + "target": 1472, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3230, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_finished", + "target": 1473, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3231, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { 
+ "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_failed", + "target": 1474, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3232, + "module": "statistics._models", + "name": "requests_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_retries", + "target": 1475, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3233, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "float", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_failed_per_minute", + "target": 1476, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3234, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "float", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_finished_per_minute", + "target": 1477, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3235, + "module": "statistics._models", + "name": "request_min_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "2973" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_min_duration", + "target": 1478, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3236, + "module": "statistics._models", + "name": "request_max_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "2973" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_max_duration", + "target": 1479, + "type": "reference" + } + }, + { 
+ "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3237, + "module": "statistics._models", + "name": "request_total_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_failed_duration", + "target": 1480, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3238, + "module": "statistics._models", + "name": "request_total_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_finished_duration", + "target": 1481, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3239, + "module": "statistics._models", + "name": "crawler_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerStartedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": 
"datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_started_at", + "target": 1482, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3240, + "module": "statistics._models", + "name": "crawler_last_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_last_started_at", + "target": 1483, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3241, + "module": "statistics._models", + "name": "crawler_finished_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerFinishedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_finished_at", + "target": 1484, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + 
"id": 3242, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "2973" + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime", + "target": 1485, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3243, + "module": "statistics._models", + "name": "errors", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.errors", + "target": 1486, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3244, + "module": "statistics._models", + "name": "retry_errors", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.retry_errors", + "target": 1487, + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3245, + "module": "statistics._models", + "name": "requests_with_status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "int" + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.requests_with_status_code", + "target": 1488, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3246, + "module": "statistics._models", + "name": "stats_persisted_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.stats_persisted_at", + "target": 1489, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3247, + "module": "statistics._models", + "name": "request_retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_retry_histogram", + "target": 1490, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestTotalDurationMillis', return_type=timedelta_ms)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3248, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_duration", + "target": 1491, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFailedDurationMillis', return_type=Optional[timedelta_ms])", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3249, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": 
"timedelta" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_avg_failed_duration", + "target": 1492, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFinishedDurationMillis', return_type=Optional[timedelta_ms])", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3250, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_avg_finished_duration", + "target": 1493, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestsTotal')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3251, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_total", + "target": 1494, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistic 
data about a crawler run with additional information related to adaptive crawling." + } + ] + }, + "decorations": [ + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2568, + 3241, + 3240, + 3242, + 3239, + 3243, + 2567, + 2566, + 2569, + 3249, + 3250, + 3236, + 3235, + 3247, + 3248, + 3237, + 3238, + 3231, + 3233, + 3230, + 3234, + 3232, + 3251, + 3245, + 3244, + 3229, + 3246 + ], + "title": "Properties" + } + ], + "id": 2565, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "AdaptivePlaywrightCrawlerStatisticState", + "parsedDocstring": { + "text": "Statistic data about a crawler run with additional information related to adaptive crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StatisticsState", + "target": "1470", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2570, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2571, + "module": 
"crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2572, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2574, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1405, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1406, + "kind": 32768, + "kindString": "Parameter", + "name": 
"persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1407, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1408, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1409, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1410, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1411, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1412, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ] + } + 
}, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1413, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1414, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__init__", + "target": 1404, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__init__", + "target": 1404, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2576, + "module": "statistics._statistics", + "name": "__aenter__", + "parsedDocstring": { + "text": "Subscribe to events and start collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 1430, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + }, + "overwrites": { + "name": 
"Statistics.__aenter__", + "target": 1429, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__aenter__", + "target": 1429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2578, + "module": "statistics._statistics", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 1432, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1433, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1434, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1435, + "kind": 32768, + "kindString": "Parameter", 
+ "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__aexit__", + "target": 1431, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__aexit__", + "target": 1431, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3219, + "module": "statistics._statistics", + "name": "replace_state_model", + "parsedDocstring": { + "text": "Create near copy of the `Statistics` with replaced `state_model`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." 
+ } + ] + }, + "flags": {}, + "id": 1416, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "replace_state_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1417, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type[TNewStatisticsState]", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics[TNewStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.replace_state_model", + "target": 1415, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.replace_state_model", + "target": 1415, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." + } + ] + }, + "decorations": [ + { + "name": "staticmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3220, + "module": "statistics._statistics", + "name": "with_default_state", + "parsedDocstring": { + "text": "Initialize a new instance with default state model `StatisticsState`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." 
+ } + ] + }, + "flags": {}, + "id": 1419, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1420, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1421, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1422, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1423, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1424, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "reference" + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1425, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1426, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal['table', 'inline']", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": 
{ + "isOptional": true, + "keyword-only": true + }, + "id": 1427, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics[StatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.with_default_state", + "target": 1418, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.with_default_state", + "target": 1418, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3221, + "module": "statistics._statistics", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.active", + "target": 1428, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3222, + "module": "statistics._statistics", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 194 + } + ], + "type": { + "name": "TStatisticsState", + "type": "reference", + "target": "1392" + }, + "inheritedFrom": { + "name": "Statistics.state", + "target": 1436, + "type": "reference" + } + }, + { + "kind": 2048, + 
"kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3223, + "module": "statistics._statistics", + "name": "register_status_code", + "parsedDocstring": { + "text": "Increment the number of times a status code has been received." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "flags": {}, + "id": 1438, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "register_status_code", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1439, + "kind": 32768, + "kindString": "Parameter", + "name": "code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.register_status_code", + "target": 1437, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.register_status_code", + "target": 1437, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3224, + "module": "statistics._statistics", + "name": "record_request_processing_start", + "parsedDocstring": { + "text": "Mark a request as started." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 205 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "flags": {}, + "id": 1441, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_start", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1442, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_start", + "target": 1440, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_start", + "target": 1440, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3225, + "module": "statistics._statistics", + "name": "record_request_processing_finish", + "parsedDocstring": { + "text": "Mark a request as finished." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 212 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." 
+ } + ] + }, + "flags": {}, + "id": 1444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_finish", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1445, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_finish", + "target": 1443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_finish", + "target": 1443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3226, + "module": "statistics._statistics", + "name": "record_request_processing_failure", + "parsedDocstring": { + "text": "Mark a request as failed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 234 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." 
+ } + ] + }, + "flags": {}, + "id": 1447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_failure", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1448, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_failure", + "target": 1446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_failure", + "target": 1446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3227, + "module": "statistics._statistics", + "name": "calculate", + "parsedDocstring": { + "text": "Calculate the current statistics." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 248 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." 
+ } + ] + }, + "flags": {}, + "id": 1450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate", + "parameters": [], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "Statistics.calculate", + "target": 1449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.calculate", + "target": 1449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3228, + "module": "statistics._statistics", + "name": "reset", + "parsedDocstring": { + "text": "Reset the statistics to their defaults and remove any persistent state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 271 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "flags": {}, + "id": 1452, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.reset", + "target": 1451, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.reset", + "target": 1451, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics compliant object that is not supposed to do anything when entering/exiting context.\n\nTo be used in sub crawlers." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2576, + 2578, + 2574, + 3227, + 3226, + 3225, + 3224, + 3223, + 3219, + 3228, + 3220 + ], + "title": "Methods" + }, + { + "children": [ + 3221, + 3222 + ], + "title": "Properties" + } + ], + "id": 2573, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "_NonPersistentStatistics", + "parsedDocstring": { + "text": "Statistics compliant object that is not supposed to do anything when entering/exiting context.\n\nTo be used in sub crawlers." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Statistics", + "target": "1403", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2584, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n", + "args": { + "rendering_type_predictor": "Object that implements RenderingTypePredictor and is capable of predicting which\nrendering method should be used. 
If None, then `DefaultRenderingTypePredictor` is used.", + "result_checker": "Function that evaluates whether crawling result is valid or not.", + "result_comparator": "Function that compares two crawling results and decides whether they are equivalent.", + "static_parser": "Implementation of `AbstractHttpParser`. Parser that will be used for static crawling.", + "static_crawler_specific_kwargs": "`AbstractHttpCrawler` only kwargs that are passed to the sub crawler.", + "playwright_crawler_specific_kwargs": "`PlaywrightCrawler` only kwargs that are passed to the sub crawler.", + "statistics": "A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of\nnon-default configuration.", + "kwargs": "Additional keyword arguments to pass to the underlying `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n" + } + ] + }, + "flags": {}, + "id": 2585, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Implementation of `AbstractHttpParser`. Parser that will be used for static crawling." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2586, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "2528" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "2529" + } + ], + "target": "2642" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Object that implements RenderingTypePredictor and is capable of predicting which\nrendering method should be used. If None, then `DefaultRenderingTypePredictor` is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2587, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "2502" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function that evaluates whether crawling result is valid or not." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2588, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function that compares two crawling results and decides whether they are equivalent." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2589, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`PlaywrightCrawler` only kwargs that are passed to the sub crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2590, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "2237" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of\nnon-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2591, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[AdaptivePlaywrightCrawlerStatisticState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "AdaptivePlaywrightCrawlerStatisticState", + "target": "2565" + } + ], + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2368, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2369, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2370, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2371, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2372, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2373, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2374, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2375, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2376, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2377, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2378, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2379, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2380, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2381, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2382, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2383, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2384, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2385, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2386, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2387, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2388, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2389, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2593, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "with_beautifulsoup_static_parser", + "parsedDocstring": { + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 218 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." + } + ] + }, + "flags": {}, + "id": 2594, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_beautifulsoup_static_parser", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2595, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "2502" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2596, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2597, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": 
"reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2598, + "kind": 32768, + "kindString": "Parameter", + "name": "parser_type", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + "target": "2328" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2599, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "2237" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2600, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[StatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "1470" + } + ], + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2368, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2369, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2370, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2371, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2372, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2373, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2374, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2375, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. 
`None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2376, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2377, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2378, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2379, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2380, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2381, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2382, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2383, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2384, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2385, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2386, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2387, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2388, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2389, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "BeautifulSoup" + } + ], + "target": "2632" + }, + { + "type": "reference", + "name": "BeautifulSoup" + }, + { + "type": "reference", + "name": "Tag" + } + ], + "target": "2583" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2602, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "with_parsel_static_parser", + "parsedDocstring": { + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." 
+ } + ] + }, + "flags": {}, + "id": 2603, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_parsel_static_parser", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2604, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "2502" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2605, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2606, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2607, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | 
None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "2237" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2608, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[StatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "1470" + } + ], + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2368, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2369, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2370, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2371, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2372, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2373, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2374, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2375, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2376, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2377, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2378, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2379, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2380, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2381, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2382, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2383, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2384, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2385, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2386, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2387, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2388, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 2389, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "Selector" + } + ], + "target": "2632" + }, + { + "type": "reference", + "name": "Selector" + }, + { + "type": "reference", + "name": "Selector" + } + ], + "target": "2583" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2610, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 418 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." + } + ] + }, + "flags": {}, + "id": 2611, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2612, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[AdaptivePlaywrightPreNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2613, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_only", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + 
{ + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2614, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_http_only_request_handler_runs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 443 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2615, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_http_only_request_handler_runs", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2616, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_browser_request_handler_runs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 446 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2617, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_browser_request_handler_runs", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2618, + "module": 
"crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_rendering_type_mispredictions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 449 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_rendering_type_mispredictions", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3323, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3324, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3325, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3326, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." + } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3327, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3328, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3329, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3330, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3331, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3332, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3333, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3334, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3335, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3336, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3337, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3338, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.\n\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects\nthat it may bring a performance benefit.\nIt uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.\n\n### Usage\n```python\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\ncrawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'browser_type': 'chromium'}\n)\n\n@crawler.router.default_handler\nasync def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:\n # Do some processing using `parsed_content`\n context.log.info(context.parsed_content.title)\n\n # Locate element h2 within 5 seconds\n h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n # Do stuff with element found by the selector\n context.log.info(h2)\n\n # Find more links and enqueue them.\n await context.enqueue_links()\n # Save some data.\n await context.push_data({'Visited url': context.request.url})\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 
2584, + 3334, + 3330, + 3336, + 3337, + 3338, + 3331, + 3335, + 3328, + 3329, + 3327, + 3332, + 2610, + 3333, + 3326, + 2616, + 2614, + 2618, + 2593, + 2602 + ], + "title": "Methods" + }, + { + "children": [ + 3323, + 3324, + 3325 + ], + "title": "Properties" + } + ], + "id": 2583, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "AdaptivePlaywrightCrawler", + "parsedDocstring": { + "text": "An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.\n\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects\nthat it may bring a performance benefit.\nIt uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.\n\n### Usage\n```python\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\ncrawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'browser_type': 'chromium'}\n)\n\n@crawler.router.default_handler\nasync def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:\n # Do some processing using `parsed_content`\n context.log.info(context.parsed_content.title)\n\n # Locate element h2 within 5 seconds\n h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n # Do stuff with element found by the selector\n context.log.info(h2)\n\n # Find more links and enqueue them.\n await context.enqueue_links()\n # Save some data.\n await context.push_data({'Visited url': context.request.url})\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": 
"BasicCrawler", + "target": "2394", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2621, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "result", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 455 + } + ], + "type": { + "name": "RequestHandlerRunResult | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestHandlerRunResult", + "target": "212" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2622, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "exception", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 456 + } + ], + "type": { + "name": "Exception | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2622, + 2621 + ], + "title": "Properties" + } + ], + "id": 2620, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "SubCrawlerRun", 
+ "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 454 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2623, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "TParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2624, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "TSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2626, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2627, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2628, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2629, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2630, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3218, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 1918, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3430, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3431, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3432, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3433, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3434, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3435, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3436, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3437, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3438, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3439, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `AbstractHttpCrawler`." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3439, + 2626, + 2630 + ], + "title": "Methods" + }, + { + "children": [ + 3434, + 3437, + 3218, + 3438, + 3432, + 3435, + 3430, + 3433, + 3431, + 3436 + ], + "title": "Properties" + } + ], + "id": 2625, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "HttpCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `AbstractHttpCrawler`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpCrawlingResult", + "target": "1917", + "type": "reference" + }, + { + "name": "BasicCrawlingContext", + "target": "309", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "2632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2633, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2634, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 2635, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2636, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 2637, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2638, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "2625" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2639, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2640, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "235" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2641, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "253" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3440, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 2627, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2628, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "309" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2629, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 2626, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3441, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 2631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "297" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 319, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 2630, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3442, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 1918, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3443, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 627 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "398" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 310, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3444, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 311, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3445, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 633 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 312, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3446, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 636 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "290" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 313, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3447, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "230" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 314, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3448, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 642 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "283" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 315, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3449, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 645 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "305" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 316, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3450, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 648 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "278" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 317, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3451, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 651 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 318, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3452, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. 
Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 321, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by `AbstractHttpCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Data structures')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3452, + 3440, + 2636, + 3441 + ], + "title": "Methods" + }, + { + "children": [ + 3447, + 2634, + 2635, + 3450, + 3442, + 3451, + 2633, + 3445, + 3448, + 3443, + 3446, + 3444, + 3449 + ], + "title": "Properties" + } + ], + "id": 2632, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "ParsedHttpCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by `AbstractHttpCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpCrawlingContext", + "target": "2625", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "ParselCrawlingContext", + "target": "2266", + "type": "reference" + }, + { + "name": "BeautifulSoupCrawlingContext", + "target": "2329", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightCrawlingContext", + "target": "2531", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2643, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse http response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse http response.\n" + } + ] + }, + "flags": {}, + "id": 2644, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2645, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "1909" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 2643, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2646, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 2647, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2648, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 2646, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2649, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 2650, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2651, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2652, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 2649, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2653, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects 
`is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 2654, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2655, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "2150" + }, + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 2653, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2656, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 2657, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2658, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2659, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 2656, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2660, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.\n" + }, + "returns": "Iterable of strings that contain found links." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 2661, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2662, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "2623" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2663, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 2660, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser used for parsing http response and inspecting parsed result to find links or detect blocking." + } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2660, + 2653, + 2656, + 2643, + 2646, + 2649 + ], + "title": "Methods" + } + ], + "id": 2642, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "AbstractHttpParser", + "parsedDocstring": { + "text": "Parser used for parsing http response and inspecting parsed result to find links or detect blocking." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "ParselParser", + "target": "2247", + "type": "reference" + }, + { + "name": "NoParser", + "target": "2277", + "type": "reference" + }, + { + "name": "BeautifulSoupParser", + "target": "2306", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2664, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2665, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2667, + "module": 
"crawlers._basic._basic_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "configuration": "The `Configuration` instance. Some of its properties are used as defaults for the crawler.", + "event_manager": "The event manager for managing events for the crawler and all its components.", + "storage_client": "The storage client for managing storages for the crawler and all its components.", + "request_manager": "Manager of requests that should be processed by the crawler.", + "session_pool": "A custom `SessionPool` instance, allowing the use of non-default configuration.", + "proxy_configuration": "HTTP proxy configuration used when making requests.", + "http_client": "HTTP client used by `BasicCrawlingContext.send_request` method.", + "request_handler": "A callable responsible for handling requests.", + "max_request_retries": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`).", + "max_requests_per_crawl": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved.", + "max_session_rotations": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit.", + "max_crawl_depth": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions.", + "use_session_pool": "Enable the use of a session pool for managing sessions during crawling.", + "retry_on_blocked": "If True, the crawler attempts to bypass bot protections automatically.", + "additional_http_error_status_codes": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered.", + "ignore_http_error_status_codes": "HTTP status codes that are typically considered errors but should be treated\nas successful responses.", + "concurrency_settings": "Settings to fine-tune concurrency levels.", + "request_handler_timeout": "Maximum duration allowed for a single request handler to run.", + "statistics": "A custom `Statistics` instance, allowing the use of non-default configuration.", + "abort_on_error": "If True, the crawler stops immediately when any request handler error occurs.", + "keep_alive": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler.", + "configure_logging": "If True, the crawler will set up logging infrastructure automatically.", + "statistics_log_format": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages.", + "respect_robots_txt_file": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. 
This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`", + "_context_pipeline": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_additional_context_managers": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_logger": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2396, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2397, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2398, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "2086" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2399, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "1189" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2400, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "1752" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2401, + "kind": 32768, + "kindString": "Parameter", + "name": "session_pool", + "type": { + "name": "SessionPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPool", + "target": "1534" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2402, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_configuration", + "type": { + "name": "ProxyConfiguration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "24" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2403, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "1919" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2404, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler", + "type": { + "name": "Callable[[TCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2405, + "kind": 32768, + "kindString": "Parameter", + "name": "max_request_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2406, + "kind": 32768, + "kindString": "Parameter", + "name": "max_requests_per_crawl", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "defaultValue": "10", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2407, + "kind": 32768, + "kindString": "Parameter", + "name": "max_session_rotations", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2408, + "kind": 32768, + "kindString": "Parameter", + "name": "max_crawl_depth", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2409, + "kind": 32768, + "kindString": "Parameter", + "name": "use_session_pool", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2410, + "kind": 32768, + "kindString": "Parameter", + "name": "retry_on_blocked", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2411, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated\nas successful responses." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2412, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2413, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2414, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2415, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[TStatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1392" + } + ], + "target": "1403" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2416, + "kind": 32768, + "kindString": "Parameter", + "name": "abort_on_error", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2417, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_alive", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2418, + "kind": 32768, + "kindString": "Parameter", + "name": "configure_logging", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages." 
+ } + ] + }, + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2419, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2420, + "kind": 32768, + "kindString": "Parameter", + "name": "respect_robots_txt_file", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2421, + "kind": 32768, + "kindString": "Parameter", + "name": "_context_pipeline", + "type": { + "name": "ContextPipeline[TCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "1" + } + ], + "target": "2349" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2422, + "kind": 32768, + "kindString": "Parameter", + "name": "_additional_context_managers", + "type": { + "name": "Sequence[AbstractAsyncContextManager] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractAsyncContextManager" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2423, + "kind": 32768, + "kindString": "Parameter", + "name": "_logger", + "type": { + "name": "logging.Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "logging.Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 2395, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2671, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "flags": {}, + "id": 2672, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2673, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TParseResult", + "target": "2623" + }, + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ], + "target": "2642" + } + } + ], + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractHttpCrawler", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "TParseResult", + "target": "2623" + } + ], + "target": "2632" + }, + { + "type": "reference", + "name": "TParseResult", + "target": "2623" + }, + { + "type": "reference", + "name": "TSelectResult", + "target": "2624" + } + ], + "target": "2666" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2674, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 2675, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2676, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[BasicCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3339, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 440 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 2424, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3340, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 445 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 2425, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 2426, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3341, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 460 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 2429, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3342, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 464 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 2431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2432, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 2430, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3343, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 2434, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "1752" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 2433, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3344, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 2436, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2437, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2438, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "662" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 2435, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3345, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 540 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ } + ] + }, + "flags": {}, + "id": 2440, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2441, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2442, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "569" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 2439, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3346, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 549 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 2444, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2445, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 2443, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3347, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 559 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 2446, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3348, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 569 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 2450, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "2366" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 2449, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3349, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 577 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 2453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "1453" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 2452, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3350, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 685 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2457, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2459, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2460, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2462, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 2456, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3351, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 738 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2464, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2465, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2467, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "823" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 2463, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3352, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. 
If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n", + "args": { + "path": "The destination path.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset`.\n\nThis helper method simplifies the process of exporting data from a `Dataset`. It opens the specified\none and then exports the data based on the provided parameters. If you need to pass options\nspecific to the output format, use the `export_data_csv` or `export_data_json` method instead.\n" + } + ] + }, + "flags": {}, + "id": 2469, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2470, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2471, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2472, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 2468, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3353, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_csv", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path.", + "content_type": "The output format.", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in csv format." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a CSV file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens\nthe specified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2474, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_csv", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2475, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2476, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2477, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in csv format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2478, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_csv", + "target": 2473, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3354, + "module": "crawlers._basic._basic_crawler", + "name": "export_data_json", + "parsedDocstring": { + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the\nspecified one and then exports the data based on the provided parameters.\n", + "args": { + "path": "The destination path", + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset`.", + "kwargs": "Extra configurations for dumping/writing in json format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export data from a `Dataset` to a JSON file.\n\nThis helper method simplifies the process of exporting data from a `Dataset` in json format. 
It opens the\nspecified one and then exports the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data_json", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination path" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2482, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2483, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra configurations for dumping/writing in json format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2484, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data_json", + "target": 2479, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests.\n\nThe `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,\nit implements HTTP communication using HTTP clients. The class allows integration with any HTTP client\nthat implements the `HttpClient` interface, provided as an input parameter to the constructor.\n\n`AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses\nand the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include\n`BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.\n\nHTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that\nrequire client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`." 
+ } + ] + }, + "decorations": [ + { + "args": "('Abstract classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2667, + 3350, + 2671, + 3346, + 3352, + 3353, + 3354, + 3347, + 3351, + 3344, + 3345, + 3343, + 3348, + 2674, + 3349, + 3342 + ], + "title": "Methods" + }, + { + "children": [ + 3339, + 3340, + 3341 + ], + "title": "Properties" + } + ], + "id": 2666, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "AbstractHttpCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests.\n\nThe `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,\nit implements HTTP communication using HTTP clients. The class allows integration with any HTTP client\nthat implements the `HttpClient` interface, provided as an input parameter to the constructor.\n\n`AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses\nand the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include\n`BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.\n\nHTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that\nrequire client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawler", + "target": "2394", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "ParselCrawler", + "target": "2273", + "type": "reference" + }, + { + "name": "HttpCrawler", + "target": "2299", + "type": "reference" + }, + { + "name": "BeautifulSoupCrawler", + "target": "2336", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2677, + "module": "browsers._types", + "name": "BrowserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2679, + "module": "browsers._types", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2680, + "module": "browsers._types", + "name": "browser_type", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2681, + "module": "browsers._types", + "name": "page", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a page object within a browser, with additional metadata for tracking and management." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2680, + 2679, + 2681 + ], + "title": "Properties" + } + ], + "id": 2678, + "module": "browsers._types", + "name": "CrawleePage", + "parsedDocstring": { + "text": "Represents a page object within a browser, with additional metadata for tracking and management." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2682, + "module": "browsers._playwright_browser_plugin", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the plugin is managing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2684, + "module": "browsers._browser_plugin", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the plugin is managing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.AUTOMATION_LIBRARY", + "target": 2803, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2685, + "module": "browsers._playwright_browser_plugin", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "browser_type": "The type of browser to launch ('chromium', 'firefox', or 'webkit').", + "user_data_dir": "Path to a User Data Directory, which stores browser session data like cookies and local\nstorage.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "max_open_pages_per_browser": "The maximum number of pages that can be opened in a single browser instance.\nOnce reached, a new browser instance will be launched to handle the excess.", + "use_incognito_pages": "By default pages share the same browser context. 
If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit')." + } + ] + }, + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2687, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a User Data Directory, which stores browser session data like cookies and local\nstorage." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2688, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. 
These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2689, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2690, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of pages that can be opened in a single browser instance.\nOnce reached, a new browser instance will be launched to handle the excess." 
+ } + ] + }, + "defaultValue": "20", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2691, + "kind": 32768, + "kindString": "Parameter", + "name": "max_open_pages_per_browser", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2692, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2693, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1991" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2694, + "module": "browsers._browser_plugin", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserPlugin.active", + "target": 2804, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser type name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2695, + "module": "browsers._browser_plugin", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the browser type name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + }, + "overwrites": { + "name": "BrowserPlugin.browser_type", + "target": 2805, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2696, + "module": "browsers._playwright_browser_plugin", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.browser_launch_options", + "target": 2806, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2697, + "module": "browsers._playwright_browser_plugin", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. 
These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.browser_new_context_options", + "target": 2807, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the maximum number of pages that can be opened in a single browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2698, + "module": "browsers._browser_plugin", + "name": "max_open_pages_per_browser", + "parsedDocstring": { + "text": "Return the maximum number of pages that can be opened in a single browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserPlugin.max_open_pages_per_browser", + "target": 2808, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2699, + "module": "browsers._browser_plugin", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPlugin", + "type": "reference", + "target": "2802" + }, + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 2809, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 2809, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + 
"id": 2701, + "module": "browsers._browser_plugin", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2813, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2814, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2815, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BrowserPlugin.__aexit__", + "target": 2811, + 
"type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.__aexit__", + "target": 2811, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2706, + "module": "browsers._browser_plugin", + "name": "new_browser", + "parsedDocstring": { + "text": "Create a new browser instance.\n", + "returns": "A new browser instance wrapped in a controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A new browser instance wrapped in a controller." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "flags": {}, + "id": 2817, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser", + "parameters": [], + "type": { + "name": "BrowserController", + "type": "reference", + "target": "2818" + }, + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 2816, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 2816, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A plugin for managing Playwright automation library.\n\nIt is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory\nfor creating new browser instances and provides a unified interface for interacting with different browser types\n(chromium, firefox, and webkit). 
This class integrates configuration options for browser launches (headless mode,\nexecutable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each\nbrowser instance, ensuring that resource limits are respected." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2699, + 2701, + 2685, + 2706 + ], + "title": "Methods" + }, + { + "children": [ + 2694, + 2684, + 2696, + 2697, + 2695, + 2698 + ], + "title": "Properties" + } + ], + "id": 2683, + "module": "browsers._playwright_browser_plugin", + "name": "PlaywrightBrowserPlugin", + "parsedDocstring": { + "text": "A plugin for managing Playwright automation library.\n\nIt is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory\nfor creating new browser instances and provides a unified interface for interacting with different browser types\n(chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,\nexecutable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each\nbrowser instance, ensuring that resource limits are respected." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BrowserPlugin", + "target": "2802", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2708, + "module": "browsers._playwright_browser_controller", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the controller is using." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2710, + "module": "browsers._browser_controller", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the controller is using." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "BrowserController.AUTOMATION_LIBRARY", + "target": 2819, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2711, + "module": "browsers._playwright_browser_controller", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "browser": "The browser instance to control.", + "max_open_pages_per_browser": "The maximum number of pages that can be open at the same time.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "header_generator": "An optional `HeaderGenerator` instance used to generate and manage HTTP headers for\nrequests made by the browser. By default, a predefined header generator is used. Set to `None` to\ndisable automatic header modifications.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2712, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The browser instance to control." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2713, + "kind": 32768, + "kindString": "Parameter", + "name": "browser", + "type": { + "name": "Browser | PlaywrightPersistentBrowser", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Browser" + }, + { + "type": "reference", + "name": "PlaywrightPersistentBrowser", + "target": "2734" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of pages that can be open at the same time." + } + ] + }, + "defaultValue": "20", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2714, + "kind": 32768, + "kindString": "Parameter", + "name": "max_open_pages_per_browser", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2715, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional `HeaderGenerator` instance used to generate and manage HTTP headers for\nrequests made by the browser. By default, a predefined header generator is used. Set to `None` to\ndisable automatic header modifications." + } + ] + }, + "defaultValue": "_DEFAULT_HEADER_GENERATOR", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2716, + "kind": 32768, + "kindString": "Parameter", + "name": "header_generator", + "type": { + "name": "HeaderGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGenerator", + "target": "1974" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2717, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1991" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the list of opened pages." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2718, + "module": "browsers._browser_controller", + "name": "pages", + "parsedDocstring": { + "text": "Return the list of opened pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Page" + } + ], + "target": "866" + }, + "overwrites": { + "name": "BrowserController.pages", + "target": 2820, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2719, + "module": "browsers._browser_controller", + "name": "total_opened_pages", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser was launched." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.total_opened_pages", + "target": 2821, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of currently open pages." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2720, + "module": "browsers._browser_controller", + "name": "pages_count", + "parsedDocstring": { + "text": "Return the number of currently open pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.pages_count", + "target": 2822, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the time when the last page was opened." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2721, + "module": "browsers._browser_controller", + "name": "last_page_opened_at", + "parsedDocstring": { + "text": "Return the time when the last page was opened." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.last_page_opened_at", + "target": 2823, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the idle time of the browser controller." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2722, + "module": "browsers._browser_controller", + "name": "idle_time", + "parsedDocstring": { + "text": "Return the idle time of the browser controller." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.idle_time", + "target": 2824, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser has free capacity to open a new page." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2723, + "module": "browsers._browser_controller", + "name": "has_free_capacity", + "parsedDocstring": { + "text": "Return if the browser has free capacity to open a new page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.has_free_capacity", + "target": 2825, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser is closed." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2724, + "module": "browsers._browser_controller", + "name": "is_browser_connected", + "parsedDocstring": { + "text": "Return if the browser is closed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.is_browser_connected", + "target": 2826, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the type of the browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2725, + "module": "browsers._browser_controller", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the type of the browser." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + }, + "overwrites": { + "name": "BrowserController.browser_type", + "target": 2827, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2726, + "module": "browsers._playwright_browser_controller", + "name": "new_page", + "parsedDocstring": { + "text": "Create a new page with the given context options.\n", + "args": { + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "Page: The newly created page.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 120 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Page: The newly created page.\n" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "flags": {}, + "id": 2727, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2728, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2729, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Page", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.new_page", + "target": 2828, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserController.new_page", + "target": 2828, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2730, + "module": "browsers._playwright_browser_controller", + "name": "close", + "parsedDocstring": { + "text": "Close the browser.\n", + "args": { + "force": "Whether to force close all open pages before closing the browser.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": 
{ + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "flags": {}, + "id": 2731, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to force close all open pages before closing the browser.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2732, + "kind": 32768, + "kindString": "Parameter", + "name": "force", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BrowserController.close", + "target": 2832, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserController.close", + "target": 2832, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controller for managing Playwright browser instances and their pages.\n\nIt provides methods to control browser instances, manage their pages, and handle context-specific\nconfigurations. It enforces limits on the number of open pages and tracks their state." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2711, + 2730, + 2726 + ], + "title": "Methods" + }, + { + "children": [ + 2710, + 2725, + 2723, + 2722, + 2724, + 2721, + 2718, + 2720, + 2719 + ], + "title": "Properties" + } + ], + "id": 2709, + "module": "browsers._playwright_browser_controller", + "name": "PlaywrightBrowserController", + "parsedDocstring": { + "text": "Controller for managing Playwright browser instances and their pages.\n\nIt provides methods to control browser instances, manage their pages, and handle context-specific\nconfigurations. 
It enforces limits on the number of open pages and tracks their state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BrowserController", + "target": "2818", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2733, + "module": "browsers._playwright_browser", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2735, + "module": "browsers._playwright_browser", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2736, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2737, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { 
+ "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2738, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2739, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2740, + "module": "browsers._playwright_browser", + "name": "browser_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2741, + "module": "browsers._playwright_browser", + "name": "contexts", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + 
} + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserContext" + } + ], + "target": "866" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2742, + "module": "browsers._playwright_browser", + "name": "is_connected", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2743, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_connected", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create persistent context instead of regular one. Merge launch options with context options." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2744, + "module": "browsers._playwright_browser", + "name": "new_context", + "parsedDocstring": { + "text": "Create persistent context instead of regular one. Merge launch options with context options." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create persistent context instead of regular one. Merge launch options with context options." 
+ } + ] + }, + "flags": {}, + "id": 2745, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2746, + "kind": 32768, + "kindString": "Parameter", + "name": "context_options", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "BrowserContext", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close browser by closing its context." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2747, + "module": "browsers._playwright_browser", + "name": "close", + "parsedDocstring": { + "text": "Close browser by closing its context." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close browser by closing its context." 
+ } + ] + }, + "flags": {}, + "id": 2748, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2749, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2750, + "module": "browsers._playwright_browser", + "name": "version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2751, + "module": "browsers._playwright_browser", + "name": "new_page", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2752, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2753, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": 
"reference" + } + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2754, + "module": "browsers._playwright_browser", + "name": "new_browser_cdp_session", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2755, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser_cdp_session", + "parameters": [], + "type": { + "name": "CDPSession", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2756, + "module": "browsers._playwright_browser", + "name": "start_tracing", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2757, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "start_tracing", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2758, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + 
{ + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2759, + "module": "browsers._playwright_browser", + "name": "stop_tracing", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2760, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stop_tracing", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2761, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper for Playwright's `Browser` that operates with a persistent context.\n\nIt utilizes Playwright's persistent browser context feature, maintaining user data across sessions.\nWhile it follows the same interface as Playwright's `Browser` class, there is no abstract base class\nenforcing this. There is a limitation that only a single persistent context is allowed." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2735, + 2747, + 2742, + 2754, + 2744, + 2751, + 2756, + 2759 + ], + "title": "Methods" + }, + { + "children": [ + 2740, + 2741, + 2750 + ], + "title": "Properties" + } + ], + "id": 2734, + "module": "browsers._playwright_browser", + "name": "PlaywrightPersistentBrowser", + "parsedDocstring": { + "text": "A wrapper for Playwright's `Browser` that operates with a persistent context.\n\nIt utilizes Playwright's persistent browser context feature, maintaining user data across sessions.\nWhile it follows the same interface as Playwright's `Browser` class, there is no abstract base class\nenforcing this. There is a limitation that only a single persistent context is allowed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2762, + "module": "browsers._browser_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2764, + "module": "browsers._browser_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "plugins": "Browser plugins serve as wrappers 
around various browser automation libraries,\nproviding a consistent interface across different libraries.", + "operation_timeout": "Operations of the underlying automation libraries, such as launching a browser\nor opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,\nwe add a timeout to these operations.", + "browser_inactive_threshold": "The period of inactivity after which a browser is considered as inactive.", + "identify_inactive_browsers_interval": "The period of inactivity after which a browser is considered\nas retired.", + "close_inactive_browsers_interval": "The interval at which the pool checks for inactive browsers\nand closes them. The browser is considered as inactive if it has no active pages and has been idle\nfor the specified period. The browser is considered as retired if it has no active pages and has total\npages count greater than or equal to `retire_browser_after_page_count`.", + "retire_browser_after_page_count": "The maximum number of processed pages after which the browser is considered\nas retired." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2765, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browser plugins serve as wrappers around various browser automation libraries,\nproviding a consistent interface across different libraries." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2766, + "kind": 32768, + "kindString": "Parameter", + "name": "plugins", + "type": { + "name": "Sequence[BrowserPlugin] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "2802" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Operations of the underlying automation libraries, such as launching a browser\nor opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,\nwe add a timeout to these operations." + } + ] + }, + "defaultValue": "timedelta(seconds=15)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2767, + "kind": 32768, + "kindString": "Parameter", + "name": "operation_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The period of inactivity after which a browser is considered as inactive." + } + ] + }, + "defaultValue": "timedelta(seconds=10)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_inactive_threshold", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The period of inactivity after which a browser is considered\nas retired." 
+ } + ] + }, + "defaultValue": "timedelta(seconds=20)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "identify_inactive_browsers_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The interval at which the pool checks for inactive browsers\nand closes them. The browser is considered as inactive if it has no active pages and has been idle\nfor the specified period. The browser is considered as retired if it has no active pages and has total\npages count greater than or equal to `retire_browser_after_page_count`." + } + ] + }, + "defaultValue": "timedelta(seconds=30)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2770, + "kind": 32768, + "kindString": "Parameter", + "name": "close_inactive_browsers_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of processed pages after which the browser is considered\nas retired." 
+ } + ] + }, + "defaultValue": "100", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2771, + "kind": 32768, + "kindString": "Parameter", + "name": "retire_browser_after_page_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2772, + "module": "browsers._browser_pool", + "name": "with_default_plugin", + "parsedDocstring": { + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n", + "args": { + "browser_type": "The type of browser to launch ('chromium', 'firefox', or 'webkit').", + "user_data_dir": "Path to a user data directory, which stores browser session data like cookies\nand local storage.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "headless": "Whether to run the browser in headless mode.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "kwargs": "Additional arguments for default constructor." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n" + } + ] + }, + "flags": {}, + "id": 2773, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_plugin", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch ('chromium', 'firefox', or 'webkit')." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2774, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserType", + "target": "2677" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a user data directory, which stores browser session data like cookies\nand local storage." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2775, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2776, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2778, + "kind": 32768, + "kindString": "Parameter", + "name": "headless", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2779, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1991" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional arguments for default constructor." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "BrowserPool", + "type": "reference", + "target": "2763" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser plugins." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2782, + "module": "browsers._browser_pool", + "name": "plugins", + "parsedDocstring": { + "text": "Return the browser plugins." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "2802" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the active browsers in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2783, + "module": "browsers._browser_pool", + "name": "active_browsers", + "parsedDocstring": { + "text": "Return the active browsers in the pool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 165 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserController", + "target": "2818" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the inactive browsers in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2784, + "module": "browsers._browser_pool", + "name": "inactive_browsers", + "parsedDocstring": { + "text": "Return the inactive browsers in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserController", + "target": "2818" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the pages in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2785, + "module": "browsers._browser_pool", + "name": "pages", + "parsedDocstring": { + "text": "Return the pages in the pool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "CrawleePage", + "target": "2678" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser pool was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2786, + "module": "browsers._browser_pool", + "name": "total_pages_count", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser pool was launched." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2787, + "module": "browsers._browser_pool", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize all browser plugins.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2788, + "module": "browsers._browser_pool", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize all browser plugins.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize all browser plugins.\n" + } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPool", + "type": "reference", + "target": "2763" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close all browser plugins.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2790, + "module": "browsers._browser_pool", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close all browser plugins.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": 
[ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close all browser plugins.\n" + } + ] + }, + "flags": {}, + "id": 2791, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2793, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2794, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a new page in a browser using the specified or a random browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2795, + "module": "browsers._browser_pool", + "name": "new_page", + "parsedDocstring": { + "text": "Open a new page in a browser using the specified or a random browser 
plugin.\n", + "args": { + "page_id": "The ID to assign to the new page. If not provided, a random ID is generated.", + "browser_plugin": "browser_plugin: The browser plugin to use for creating the new page.\nIf not provided, the next plugin in the rotation is used.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "The newly created browser page." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 241 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The newly created browser page." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open a new page in a browser using the specified or a random browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 2796, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID to assign to the new page. If not provided, a random ID is generated." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "page_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "browser_plugin: The browser plugin to use for creating the new page.\nIf not provided, the next plugin in the rotation is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_plugin", + "type": { + "name": "BrowserPlugin | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "2802" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "CrawleePage", + "type": "reference", + "target": "2678" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2800, + "module": "browsers._browser_pool", + "name": "new_page_with_each_plugin", + "parsedDocstring": { + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. 
Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n", + "returns": "A list of newly created pages, one for each plugin in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 271 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A list of newly created pages, one for each plugin in the pool." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n" + } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page_with_each_plugin", + "parameters": [], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CrawleePage", + "target": "2678" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manage a pool of browsers and pages, handling their lifecycle and resource allocation.\n\nThe `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,\nand handling the overall lifecycle of these resources. It provides flexible configuration via\nconstructor options, which include various hooks that allow for the insertion of custom behavior\nat different stages of the browser and page lifecycles.\n\nThe browsers in the pool can be in one of three states: active, inactive, or closed." 
+ } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2788, + 2790, + 2764, + 2795, + 2800, + 2772 + ], + "title": "Methods" + }, + { + "children": [ + 2787, + 2783, + 2784, + 2785, + 2782, + 2786 + ], + "title": "Properties" + } + ], + "id": 2763, + "module": "browsers._browser_pool", + "name": "BrowserPool", + "parsedDocstring": { + "text": "Manage a pool of browsers and pages, handling their lifecycle and resource allocation.\n\nThe `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,\nand handling the overall lifecycle of these resources. It provides flexible configuration via\nconstructor options, which include various hooks that allow for the insertion of custom behavior\nat different stages of the browser and page lifecycles.\n\nThe browsers in the pool can be in one of three states: active, inactive, or closed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the plugin is managing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2803, + "module": "browsers._browser_plugin", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the plugin is managing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2804, + "module": "browsers._browser_plugin", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser type name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2805, + "module": "browsers._browser_plugin", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the browser type name." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. 
These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2806, + "module": "browsers._browser_plugin", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2807, + "module": "browsers._browser_plugin", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the maximum number of pages that can be opened in a single browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2808, + "module": "browsers._browser_plugin", + "name": "max_open_pages_per_browser", + "parsedDocstring": { + "text": "Return the maximum number of pages that can be opened in a single browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2809, + "module": "browsers._browser_plugin", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPlugin", + "type": "reference", + "target": "2802" + }, + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 2809, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2811, + "module": "browsers._browser_plugin", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2813, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2814, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2815, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BrowserPlugin.__aexit__", + "target": 2811, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "decorations": [ + { 
+ "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2816, + "module": "browsers._browser_plugin", + "name": "new_browser", + "parsedDocstring": { + "text": "Create a new browser instance.\n", + "returns": "A new browser instance wrapped in a controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A new browser instance wrapped in a controller." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "flags": {}, + "id": 2817, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser", + "parameters": [], + "type": { + "name": "BrowserController", + "type": "reference", + "target": "2818" + }, + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 2816, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for browser plugins.\n\nBrowser plugins act as wrappers around browser automation tools like Playwright,\nproviding a unified interface for interacting with browsers." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2809, + 2811, + 2816 + ], + "title": "Methods" + }, + { + "children": [ + 2804, + 2803, + 2806, + 2807, + 2805, + 2808 + ], + "title": "Properties" + } + ], + "id": 2802, + "module": "browsers._browser_plugin", + "name": "BrowserPlugin", + "parsedDocstring": { + "text": "An abstract base class for browser plugins.\n\nBrowser plugins act as wrappers around browser automation tools like Playwright,\nproviding a unified interface for interacting with browsers." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightBrowserPlugin", + "target": "2683", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the controller is using." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2819, + "module": "browsers._browser_controller", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the controller is using." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the list of opened pages." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2820, + "module": "browsers._browser_controller", + "name": "pages", + "parsedDocstring": { + "text": "Return the list of opened pages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Page" + } + ], + "target": "866" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2821, + "module": "browsers._browser_controller", + "name": "total_opened_pages", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser was launched." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of currently open pages." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2822, + "module": "browsers._browser_controller", + "name": "pages_count", + "parsedDocstring": { + "text": "Return the number of currently open pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the time when the last page was opened." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2823, + "module": "browsers._browser_controller", + "name": "last_page_opened_at", + "parsedDocstring": { + "text": "Return the time when the last page was opened." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the idle time of the browser controller." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2824, + "module": "browsers._browser_controller", + "name": "idle_time", + "parsedDocstring": { + "text": "Return the idle time of the browser controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser has free capacity to open a new page." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2825, + "module": "browsers._browser_controller", + "name": "has_free_capacity", + "parsedDocstring": { + "text": "Return if the browser has free capacity to open a new page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser is closed." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2826, + "module": "browsers._browser_controller", + "name": "is_browser_connected", + "parsedDocstring": { + "text": "Return if the browser is closed." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the type of the browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2827, + "module": "browsers._browser_controller", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the type of the browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "2677" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2828, + "module": "browsers._browser_controller", + "name": "new_page", + "parsedDocstring": { + "text": "Create a new page with the given context options.\n", + "args": { + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "Page: The newly created page.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Page: The newly created page.\n" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "flags": {}, + "id": 2829, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. 
These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2830, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2831, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2832, + "module": "browsers._browser_controller", + "name": "close", + "parsedDocstring": { + "text": "Close the browser.\n", + "args": { + "force": "Whether to force close all open pages before closing the browser.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + 
"name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "flags": {}, + "id": 2833, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to force close all open pages before closing the browser.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2834, + "kind": 32768, + "kindString": "Parameter", + "name": "force", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for managing browser instance and their pages." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2832, + 2828 + ], + "title": "Methods" + }, + { + "children": [ + 2819, + 2827, + 2825, + 2824, + 2826, + 2823, + 2820, + 2822, + 2821 + ], + "title": "Properties" + } + ], + "id": 2818, + "module": "browsers._browser_controller", + "name": "BrowserController", + "parsedDocstring": { + "text": "An abstract base class for managing browser instance and their pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightBrowserController", + "target": "2709", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 4xx status codes, `False` otherwise." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2835, + "module": "_utils.web", + "name": "is_status_code_client_error", + "parsedDocstring": { + "text": "Return `True` for 4xx status codes, `False` otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/web.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 4 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 4xx status codes, `False` otherwise." + } + ] + }, + "flags": {}, + "id": 2836, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_status_code_client_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2837, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 5xx status codes, `False` otherwise." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2838, + "module": "_utils.web", + "name": "is_status_code_server_error", + "parsedDocstring": { + "text": "Return `True` for 5xx status codes, `False` otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/web.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 5xx status codes, `False` otherwise." 
+ } + ] + }, + "flags": {}, + "id": 2839, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_status_code_server_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2840, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2841, + "module": "_utils.wait", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2842, + "module": "_utils.wait", + "name": "wait_for", + "parsedDocstring": { + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n", + "args": { + "operation": "A function that returns the future to wait for.", + "timeout": "How long should we wait before cancelling the future.", + "timeout_message": "Message to be included in the `TimeoutError` in case of timeout.", + "max_retries": "How many times should the operation be attempted.", + "logger": "Used to report information about retries as they happen." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n" + } + ] + }, + "flags": {}, + "id": 2843, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that returns the future to wait for." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2844, + "kind": 32768, + "kindString": "Parameter", + "name": "operation", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "T", + "target": "117" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How long should we wait before cancelling the future." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2845, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Message to be included in the `TimeoutError` in case of timeout." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2846, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout_message", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How many times should the operation be attempted." + } + ] + }, + "defaultValue": "1", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2847, + "kind": 32768, + "kindString": "Parameter", + "name": "max_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used to report information about retries as they happen." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2848, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "Logger", + "type": "reference" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all tasks to finish or until the timeout is reached.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2849, + "module": "_utils.wait", + "name": "wait_for_all_tasks_for_finish", + "parsedDocstring": { + "text": "Wait for all tasks to finish or until the timeout is reached.\n", + "args": { + "tasks": "A sequence of asyncio tasks to wait for.", + "logger": "Logger to use for reporting.", + "timeout": "How long should we wait before cancelling the tasks." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all tasks to finish or until the timeout is reached.\n" + } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_tasks_for_finish", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A sequence of asyncio tasks to wait for." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "tasks", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "asyncio.Task" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger to use for reporting." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2852, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "Logger", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How long should we wait before cancelling the tasks." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2853, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a URL is absolute." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2854, + "module": "_utils.urls", + "name": "is_url_absolute", + "parsedDocstring": { + "text": "Check if a URL is absolute." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a URL is absolute." + } + ] + }, + "flags": {}, + "id": 2855, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_url_absolute", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a relative URL to an absolute URL using a base URL." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2857, + "module": "_utils.urls", + "name": "convert_to_absolute_url", + "parsedDocstring": { + "text": "Convert a relative URL to an absolute URL using a base URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a relative URL to an absolute URL using a base URL." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "convert_to_absolute_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "base_url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2860, + "kind": 32768, + "kindString": "Parameter", + "name": "relative_url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2861, + "module": "_utils.urls", + "name": "to_absolute_url_iterator", + "parsedDocstring": { + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." + } + ] + }, + "flags": {}, + "id": 2862, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_absolute_url_iterator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "base_url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2864, + "kind": 32768, + "kindString": "Parameter", + "name": "urls", + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Validate the given HTTP URL.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2865, + "module": "_utils.urls", + "name": "validate_http_url", + "parsedDocstring": { + "text": "Validate the given HTTP URL.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Validate the given HTTP URL.\n" + } + ] + }, + "flags": {}, + "id": 2866, + "kind": 4096, + "kindString": "Call signature", + 
"modifiers": [], + "name": "validate_http_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2867, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." + } + ] + }, + "decorations": [ + { + "name": "contextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 2868, + "module": "_utils.try_import", + "name": "try_import", + "parsedDocstring": { + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." 
+ } + ] + }, + "flags": {}, + "id": 2869, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "try_import", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2870, + "kind": 32768, + "kindString": "Parameter", + "name": "module_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2871, + "kind": 32768, + "kindString": "Parameter", + "name": "symbol_names", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Install an import hook for a specified module." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2872, + "module": "_utils.try_import", + "name": "install_import_hook", + "parsedDocstring": { + "text": "Install an import hook for a specified module." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Install an import hook for a specified module." 
+ } + ] + }, + "flags": {}, + "id": 2873, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "install_import_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2874, + "kind": 32768, + "kindString": "Parameter", + "name": "module_name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The error message associated with the failed import." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2876, + "module": "_utils.try_import", + "name": "message", + "parsedDocstring": { + "text": "The error message associated with the failed import." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent a placeholder for a failed import." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2876 + ], + "title": "Properties" + } + ], + "id": 2875, + "module": "_utils.try_import", + "name": "FailedImport", + "parsedDocstring": { + "text": "Represent a placeholder for a failed import." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2878, + "module": "_utils.try_import", + "name": "__getattribute__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2879, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getattribute__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2880, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper class for modules to handle attribute access for failed imports." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2878 + ], + "title": "Methods" + } + ], + "id": 2877, + "module": "_utils.try_import", + "name": "ImportWrapper", + "parsedDocstring": { + "text": "A wrapper class for modules to handle attribute access for failed imports." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2881, + "module": "_utils.system", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2883, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ratio of CPU currently in use, represented as a float between 0 and 1." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2884, + "module": "_utils.system", + "name": "used_ratio", + "parsedDocstring": { + "text": "The ratio of CPU currently in use, represented as a float between 0 and 1." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the measurement was taken." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2885, + "module": "_utils.system", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the measurement was taken." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about the CPU usage." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2885, + 2883, + 2884 + ], + "title": "Properties" + } + ], + "id": 2882, + "module": "_utils.system", + "name": "CpuInfo", + "parsedDocstring": { + "text": "Information about the CPU usage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2887, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2888, + "module": "_utils.system", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the measurement was taken." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2889, + "module": "_utils.system", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the measurement was taken." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about the memory usage." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2889, + 2888, + 2887 + ], + "title": "Properties" + } + ], + "id": 2886, + "module": "_utils.system", + "name": "MemoryUsageInfo", + "parsedDocstring": { + "text": "Information about the memory usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "MemoryInfo", + "target": "2890", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2891, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "MemoryUsageInfo.model_config", + "target": 2887, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total memory available in the system." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2892, + "module": "_utils.system", + "name": "total_size", + "parsedDocstring": { + "text": "Total memory available in the system." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3216, + "module": "_utils.system", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='currentSize'), ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MemoryUsageInfo.current_size", + "target": 2888, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the measurement was taken." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3217, + "module": "_utils.system", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the measurement was taken." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "inheritedFrom": { + "name": "MemoryUsageInfo.created_at", + "target": 2889, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about system memory." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3217, + 3216, + 2891, + 2892 + ], + "title": "Properties" + } + ], + "id": 2890, + "module": "_utils.system", + "name": "MemoryInfo", + "parsedDocstring": { + "text": "Information about system memory." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MemoryUsageInfo", + "target": "2886", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2893, + "module": "_utils.system", + "name": "get_cpu_info", + "parsedDocstring": { + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." 
+ } + ] + }, + "flags": {}, + "id": 2894, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cpu_info", + "parameters": [], + "type": { + "name": "CpuInfo", + "type": "reference", + "target": "2882" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2895, + "module": "_utils.system", + "name": "get_memory_info", + "parsedDocstring": { + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." 
+ } + ] + }, + "flags": {}, + "id": 2896, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_memory_info", + "parameters": [], + "type": { + "name": "MemoryInfo", + "type": "reference", + "target": "2890" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2898, + "module": "_utils.robots", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2899, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2900, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2901, + "kind": 32768, + "kindString": "Parameter", + "name": "robots", + "type": { + "name": "Protego", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a `RobotsTxtFile` instance from the given content.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2902, + "module": "_utils.robots", + "name": "from_content", + "parsedDocstring": { + "text": "Create a `RobotsTxtFile` instance from the given content.\n", + "args": { + "url": "The URL 
associated with the robots.txt file.", + "content": "The raw string content of the robots.txt file to be parsed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a `RobotsTxtFile` instance from the given content.\n" + } + ] + }, + "flags": {}, + "id": 2903, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_content", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL associated with the robots.txt file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2904, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The raw string content of the robots.txt file to be parsed." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2905, + "kind": 32768, + "kindString": "Parameter", + "name": "content", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2906, + "module": "_utils.robots", + "name": "find", + "parsedDocstring": { + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n", + "args": { + "url": "The URL whose domain will be used to find the corresponding robots.txt file.", + "http_client": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.", + "proxy_info": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n" + } + ] + }, + "flags": {}, + "id": 2907, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "find", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL whose domain will be used to find the corresponding robots.txt file." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2908, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2909, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2910, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Load the robots.txt file for a given URL.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2911, + "module": "_utils.robots", + "name": "load", + "parsedDocstring": { + "text": "Load the robots.txt file for a given URL.\n", + "args": { + "url": "The direct URL of the robots.txt file to be loaded.", + "http_client": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file.", + "proxy_info": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Load the robots.txt file for a given URL.\n" + } + ] + }, + "flags": {}, + "id": 2912, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "load", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The direct URL of the robots.txt file to be loaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2913, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2914, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "1919" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2915, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "15" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the given URL is allowed for the given user agent.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2916, + "module": "_utils.robots", + "name": "is_allowed", + "parsedDocstring": { + "text": "Check if the given URL is allowed for the given user agent.\n", + "args": { + "url": "The URL to check against the robots.txt rules.", + "user_agent": "The user-agent string to check permissions for. Defaults to '*' which matches any user-agent." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the given URL is allowed for the given user agent.\n" + } + ] + }, + "flags": {}, + "id": 2917, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_allowed", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to check against the robots.txt rules." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2918, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The user-agent string to check permissions for. Defaults to '*' which matches any user-agent." + } + ] + }, + "defaultValue": "'*'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2919, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the list of sitemaps urls from the robots.txt file." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2920, + "module": "_utils.robots", + "name": "get_sitemaps", + "parsedDocstring": { + "text": "Get the list of sitemaps urls from the robots.txt file." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the list of sitemaps urls from the robots.txt file." 
+ } + ] + }, + "flags": {}, + "id": 2921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_sitemaps", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the crawl delay for the given user agent.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2922, + "module": "_utils.robots", + "name": "get_crawl_delay", + "parsedDocstring": { + "text": "Get the crawl delay for the given user agent.\n", + "args": { + "user_agent": "The user-agent string to check the crawl delay for. Defaults to '*' which matches any\nuser-agent." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the crawl delay for the given user agent.\n" + } + ] + }, + "flags": {}, + "id": 2923, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_crawl_delay", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The user-agent string to check the crawl delay for. Defaults to '*' which matches any\nuser-agent." 
+ } + ] + }, + "defaultValue": "'*'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2924, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2898, + 2906, + 2902, + 2922, + 2920, + 2916, + 2911 + ], + "title": "Methods" + } + ], + "id": 2897, + "module": "_utils.robots", + "name": "RobotsTxtFile", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2925, + "module": "_utils.requests", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate a deterministic request ID based on a unique key.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2926, + "module": "_utils.requests", + "name": "unique_key_to_request_id", + "parsedDocstring": { + "text": "Generate a deterministic request ID based on a unique key.\n", + "args": { + "unique_key": "The unique key to convert into 
a request ID.", + "request_id_length": "The length of the request ID.\n" + }, + "returns": "A URL-safe, truncated request ID based on the unique key." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A URL-safe, truncated request ID based on the unique key." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Generate a deterministic request ID based on a unique key.\n" + } + ] + }, + "flags": {}, + "id": 2927, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "unique_key_to_request_id", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique key to convert into a request ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2928, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The length of the request ID.\n" + } + ] + }, + "defaultValue": "15", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2929, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_length", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting 
the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2930, + "module": "_utils.requests", + "name": "normalize_url", + "parsedDocstring": { + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n", + "args": { + "url": "The URL to be normalized.", + "keep_url_fragment": "Flag to determine whether the fragment part of the URL should be retained.\n" + }, + "returns": "A string containing the normalized URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string containing the normalized URL." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. 
The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n" + } + ] + }, + "flags": {}, + "id": 2931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "normalize_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to be normalized." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2932, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to determine whether the fragment part of the URL should be retained.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2933, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\nis just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2934, + "module": "_utils.requests", + "name": "compute_unique_key", + "parsedDocstring": { + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. 
If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\nis just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n", + "args": { + "url": "The request URL.", + "method": "The HTTP method.", + "headers": "The HTTP headers.", + "payload": "The data to be sent as the request body.", + "keep_url_fragment": "A flag indicating whether to keep the URL fragment.", + "use_extended_unique_key": "A flag indicating whether to include a hashed payload in the key.", + "session_id": "The ID of a specific `Session` to which the request will be strictly bound\n" + }, + "returns": "A string representing the unique key for the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string representing the unique key for the request." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\nis just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n" + } + ] + }, + "flags": {}, + "id": 2935, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compute_unique_key", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request URL." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2936, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2937, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "GET" + }, + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "POST" + }, + { + "type": "literal", + "value": "PUT" + }, + { + "type": "literal", + "value": "DELETE" + }, + { + "type": "literal", + "value": "CONNECT" + }, + { + "type": "literal", + "value": "OPTIONS" + }, + { + "type": "literal", + "value": "TRACE" + }, + { + "type": "literal", + "value": "PATCH" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2938, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "123" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2939, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "3048" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of a specific `Session` to which the request will be strictly bound\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2940, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A flag indicating whether to keep the URL fragment." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2941, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A flag indicating whether to include a hashed payload in the key." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2942, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2943, + "module": "_utils.recurring_task", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2945, + "module": "_utils.recurring_task", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2946, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2947, + "kind": 32768, + "kindString": "Parameter", + "name": "func", + "type": { + "name": "Callable", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2948, + "kind": 32768, + "kindString": "Parameter", + "name": "delay", + "type": { + "name": 
"timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the recurring task execution." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2949, + "module": "_utils.recurring_task", + "name": "start", + "parsedDocstring": { + "text": "Start the recurring task execution." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the recurring task execution." + } + ] + }, + "flags": {}, + "id": 2950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "start", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop the recurring task execution." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2951, + "module": "_utils.recurring_task", + "name": "stop", + "parsedDocstring": { + "text": "Stop the recurring task execution." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop the recurring task execution." 
+ } + ] + }, + "flags": {}, + "id": 2952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Class for creating and managing recurring tasks.\n" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2945, + 2949, + 2951 + ], + "title": "Methods" + } + ], + "id": 2944, + "module": "_utils.recurring_task", + "name": "RecurringTask", + "parsedDocstring": { + "text": "Class for creating and managing recurring tasks.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2953, + "module": "_utils.recoverable_state", + "name": "TStateModel", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new recoverable state object.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2955, + "module": "_utils.recoverable_state", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new recoverable state object.\n", + "args": { + "default_state": "The default state model instance to use when no persisted state is found.\nA deep copy 
is made each time the state is used.", + "persist_state_key": "The key under which the state is stored in the KeyValueStore", + "persistence_enabled": "Flag to enable or disable state persistence", + "persist_state_kvs_name": "The name of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used.", + "persist_state_kvs_id": "The identifier of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used.", + "logger": "A logger instance for logging operations related to state persistence" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new recoverable state object.\n" + } + ] + }, + "flags": {}, + "id": 2956, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default state model instance to use when no persisted state is found.\nA deep copy is made each time the state is used." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2957, + "kind": 32768, + "kindString": "Parameter", + "name": "default_state", + "type": { + "name": "TStateModel", + "type": "reference", + "target": "2953" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which the state is stored in the KeyValueStore" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2958, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to enable or disable state persistence" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2959, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2960, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The identifier of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2961, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance for logging operations related to state persistence" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2962, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "logging.Logger", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2963, + "module": "_utils.recoverable_state", + "name": "initialize", + "parsedDocstring": { + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. 
It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n", + "returns": "The loaded state model" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The loaded state model" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n" + } + ] + }, + "flags": {}, + "id": 2964, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "initialize", + "parameters": [], + "type": { + "name": "TStateModel", + "type": "reference", + "target": "2953" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2965, + "module": "_utils.recoverable_state", + "name": "teardown", + "parsedDocstring": { + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." + } + ] + }, + "flags": {}, + "id": 2966, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "teardown", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2967, + "module": "_utils.recoverable_state", + "name": "current_value", + "parsedDocstring": { + "text": "Get the current state." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "TStateModel", + "type": "reference", + "target": "2953" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2968, + "module": "_utils.recoverable_state", + "name": "reset", + "parsedDocstring": { + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." + } + ] + }, + "flags": {}, + "id": 2969, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2970, + "module": "_utils.recoverable_state", + "name": "persist_state", + "parsedDocstring": { + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n", + "args": { + "event_data": "Optional data associated with a PERSIST_STATE event" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n" + } + ] + }, + "flags": {}, + "id": 2971, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_state", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional data associated with a PERSIST_STATE event" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2972, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventPersistStateData", + "target": "2048" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for managing persistent recoverable state using a Pydantic model.\n\nThis class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved\nacross migrations or restarts. It manages the loading, saving, and resetting of state data,\nwith optional persistence capabilities.\n\nThe state is represented by a Pydantic model that can be serialized to and deserialized from JSON.\nThe class automatically hooks into the event system to persist state when needed.\n\nType Parameters:\nTStateModel: A Pydantic BaseModel type that defines the structure of the state data.\nTypically, it should be inferred from the `default_state` constructor parameter." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2955, + 2963, + 2970, + 2968, + 2965 + ], + "title": "Methods" + }, + { + "children": [ + 2967 + ], + "title": "Properties" + } + ], + "id": 2954, + "module": "_utils.recoverable_state", + "name": "RecoverableState", + "parsedDocstring": { + "text": "A class for managing persistent recoverable state using a Pydantic model.\n\nThis class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved\nacross migrations or restarts. It manages the loading, saving, and resetting of state data,\nwith optional persistence capabilities.\n\nThe state is represented by a Pydantic model that can be serialized to and deserialized from JSON.\nThe class automatically hooks into the event system to persist state when needed.\n\nType Parameters:\nTStateModel: A Pydantic BaseModel type that defines the structure of the state data.\nTypically, it should be inferred from the `default_state` constructor parameter." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2973, + "module": "_utils.models", + "name": "timedelta_ms", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2974, + "module": "_utils.models", + "name": "timedelta_secs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2976, + "module": "_utils.measure_time", + "name": "wall", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/measure_time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2977, + "module": "_utils.measure_time", + "name": "cpu", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/measure_time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2977, + 2976 + ], + "title": "Properties" + } + ], + "id": 2975, + "module": "_utils.measure_time", + "name": "TimerResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/measure_time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." + } + ] + }, + "decorations": [ + { + "name": "contextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 2978, + "module": "_utils.measure_time", + "name": "measure_time", + "parsedDocstring": { + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/measure_time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." + } + ] + }, + "flags": {}, + "id": 2979, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "measure_time", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TimerResult", + "target": "2975" + } + ] + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2980, + "module": "_utils.html_to_text", + "name": "SKIP_TAGS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/html_to_text.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2981, + "module": "_utils.html_to_text", + "name": "BLOCK_TAGS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/html_to_text.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"decorations": [], + "flags": {}, + "groups": [], + "id": 2983, + "module": "_utils.globs", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/globs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2984, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2985, + "kind": 32768, + "kindString": "Parameter", + "name": "glob", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps a glob pattern (supports the `*`, `**`, `?` wildcards)." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2983 + ], + "title": "Methods" + } + ], + "id": 2982, + "module": "_utils.globs", + "name": "Glob", + "parsedDocstring": { + "text": "Wraps a glob pattern (supports the `*`, `**`, `?` wildcards)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/globs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 8, + "kindString": "Enumeration", + "children": [ + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2987, + "module": "_utils.file", + "name": "JSON", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "type": "literal", + "value": "r'^application/json'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2988, + "module": "_utils.file", + "name": "TEXT", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "type": "literal", + "value": "r'^text/'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2989, + "module": "_utils.file", + "name": "XML", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "type": "literal", + "value": "r'^application/.*xml$'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the 
content type matches the enum's pattern." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2990, + "module": "_utils.file", + "name": "matches", + "parsedDocstring": { + "text": "Check if the content type matches the enum's pattern." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "type": "literal" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2987, + 2990, + 2988, + 2989 + ], + "title": "Enumeration members" + } + ], + "id": 2986, + "module": "_utils.file", + "name": "ContentType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the provided content type string matches the specified ContentType." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2991, + "module": "_utils.file", + "name": "is_content_type", + "parsedDocstring": { + "text": "Check if the provided content type string matches the specified ContentType." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the provided content type string matches the specified ContentType." 
+ } + ] + }, + "flags": {}, + "id": 2992, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_content_type", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2993, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type_enum", + "type": { + "name": "ContentType", + "type": "reference", + "target": "2986" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2994, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a file, suppressing the FileNotFoundError if it does not exist.\n\nJS-like rm(filename, { force: true }).\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2995, + "module": "_utils.file", + "name": "force_remove", + "parsedDocstring": { + "text": "Remove a file, suppressing the FileNotFoundError if it does not exist.\n\nJS-like rm(filename, { force: true }).\n", + "args": { + "filename": "The path to the file to be removed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a file, suppressing the FileNotFoundError if it does not exist.\n\nJS-like rm(filename, { force: true }).\n" + } + ] + }, + "flags": {}, + "id": 2996, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "force_remove", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The path to the file to be removed." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2997, + "kind": 32768, + "kindString": "Parameter", + "name": "filename", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Rename a directory, ensuring that the destination directory is removed if it exists.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2998, + "module": "_utils.file", + "name": "force_rename", + "parsedDocstring": { + "text": "Rename a directory, ensuring that the destination directory is removed if it exists.\n", + "args": { + "src_dir": "The source directory path.", + "dst_dir": "The destination directory path." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Rename a directory, ensuring that the destination directory is removed if it exists.\n" + } + ] + }, + "flags": {}, + "id": 2999, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "force_rename", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The source directory path." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3000, + "kind": 32768, + "kindString": "Parameter", + "name": "src_dir", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination directory path." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3001, + "kind": 32768, + "kindString": "Parameter", + "name": "dst_dir", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine the file extension for a given MIME content type.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3002, + "module": "_utils.file", + "name": "determine_file_extension", + "parsedDocstring": { + "text": "Determine the file extension for a given MIME content type.\n", + "args": { + "content_type": "The MIME content type string.\n" + }, + "returns": "A string representing the determined file extension without a leading dot,\nor None if no extension could be determined." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string representing the determined file extension without a leading dot,\nor None if no extension could be determined." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Determine the file extension for a given MIME content type.\n" + } + ] + }, + "flags": {}, + "id": 3003, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "determine_file_extension", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The MIME content type string.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3004, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine if the input value is a file-like object or bytes.\n\nThis function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).\nThe method is simplified for common use cases and may not cover all edge cases.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3005, + "module": "_utils.file", + "name": "is_file_or_bytes", + "parsedDocstring": { + "text": "Determine if the input value is a file-like object or bytes.\n\nThis function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).\nThe method is simplified for common use cases and may not cover all edge cases.\n", + "args": { + "value": "The value to be checked.\n" + }, + "returns": "True if the value is either a file-like object or bytes, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the value is either a file-like object or bytes, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Determine if the input value is a file-like object or bytes.\n\nThis function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).\nThe method is simplified for common use cases and may not cover all edge cases.\n" + } + ] + }, + "flags": {}, + "id": 3006, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_file_or_bytes", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The value to be checked.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3007, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Serialize an object to a JSON-formatted string with specific settings.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3008, + "module": "_utils.file", + "name": "json_dumps", + "parsedDocstring": { + "text": "Serialize an object to a JSON-formatted string with specific settings.\n", + "args": { + "obj": "The object to serialize.\n" + }, + "returns": "A string containing the JSON representation of the input object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string containing the JSON representation of the input object." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Serialize an object to a JSON-formatted string with specific settings.\n" + } + ] + }, + "flags": {}, + "id": 3009, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "json_dumps", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The object to serialize.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3010, + "kind": 32768, + "kindString": "Parameter", + "name": "obj", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3011, + "module": "_utils.docs", + "name": "GroupName", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/docs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a symbol for rendering and grouping in documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 
3012, + "module": "_utils.docs", + "name": "docs_group", + "parsedDocstring": { + "text": "Mark a symbol for rendering and grouping in documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n", + "args": { + "group_name": "The documentation group to which the symbol belongs.\n" + }, + "returns": "The original callable without modification." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/docs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The original callable without modification." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a symbol for rendering and grouping in documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n" + } + ] + }, + "flags": {}, + "id": 3013, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "docs_group", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The documentation group to which the symbol belongs.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3014, + "kind": 32768, + "kindString": "Parameter", + "name": "group_name", + "type": { + "name": "GroupName", + "type": "reference", + "target": "1495" + } + } + ], + "type": { + "name": "Callable", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extract the value of an enumeration member if it is an Enum, otherwise return the original value." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3015, + "module": "_utils.data_processing", + "name": "maybe_extract_enum_member_value", + "parsedDocstring": { + "text": "Extract the value of an enumeration member if it is an Enum, otherwise return the original value." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/data_processing.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extract the value of an enumeration member if it is an Enum, otherwise return the original value." + } + ] + }, + "flags": {}, + "id": 3016, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "maybe_extract_enum_member_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3017, + "kind": 32768, + "kindString": "Parameter", + "name": "maybe_enum_member", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the response body based on the content type." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3018, + "module": "_utils.data_processing", + "name": "maybe_parse_body", + "parsedDocstring": { + "text": "Parse the response body based on the content type." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/data_processing.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the response body based on the content type." 
+ } + ] + }, + "flags": {}, + "id": 3019, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "maybe_parse_body", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3020, + "kind": 32768, + "kindString": "Parameter", + "name": "body", + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3021, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise an error indicating that a storage with the provided key name and value already exists." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3022, + "module": "_utils.data_processing", + "name": "raise_on_duplicate_storage", + "parsedDocstring": { + "text": "Raise an error indicating that a storage with the provided key name and value already exists." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/data_processing.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise an error indicating that a storage with the provided key name and value already exists." 
+ } + ] + }, + "flags": {}, + "id": 3023, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "raise_on_duplicate_storage", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3024, + "kind": 32768, + "kindString": "Parameter", + "name": "client_type", + "type": { + "name": "StorageTypes", + "type": "reference", + "target": "153" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3025, + "kind": 32768, + "kindString": "Parameter", + "name": "key_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3026, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "NoReturn", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise an error indicating that a storage with the provided id does not exist." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3027, + "module": "_utils.data_processing", + "name": "raise_on_non_existing_storage", + "parsedDocstring": { + "text": "Raise an error indicating that a storage with the provided id does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/data_processing.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise an error indicating that a storage with the provided id does not exist." 
+ } + ] + }, + "flags": {}, + "id": 3028, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "raise_on_non_existing_storage", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3029, + "kind": 32768, + "kindString": "Parameter", + "name": "client_type", + "type": { + "name": "StorageTypes", + "type": "reference", + "target": "153" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3030, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "NoReturn", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3031, + "module": "_utils.crypto", + "name": "compute_short_hash", + "parsedDocstring": { + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n", + "args": { + "data": "The binary data to be hashed.", + "length": "The length of the hash to be returned.\n" + }, + "returns": "A substring (prefix) of the hexadecimal hash of the data." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/crypto.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A substring (prefix) of the hexadecimal hash of the data." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n" + } + ] + }, + "flags": {}, + "id": 3032, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compute_short_hash", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The binary data to be hashed." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3033, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "bytes", + "type": "reference", + "target": "3048" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The length of the hash to be returned.\n" + } + ] + }, + "defaultValue": "8", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3034, + "kind": 32768, + "kindString": "Parameter", + "name": "length", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate a random object ID." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3035, + "module": "_utils.crypto", + "name": "crypto_random_object_id", + "parsedDocstring": { + "text": "Generate a random object ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/crypto.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate a random object ID." 
+ } + ] + }, + "flags": {}, + "id": 3036, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crypto_random_object_id", + "parameters": [ + { + "defaultValue": "17", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3037, + "kind": 32768, + "kindString": "Parameter", + "name": "length", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3038, + "module": "_utils.context", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. Works for both synchronous and asynchronous methods.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3039, + "module": "_utils.context", + "name": "ensure_context", + "parsedDocstring": { + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. 
Works for both synchronous and asynchronous methods.\n", + "args": { + "method": "The method to wrap.\n" + }, + "returns": "The wrapped method with context checking applied." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The wrapped method with context checking applied." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. Works for both synchronous and asynchronous methods.\n" + } + ] + }, + "flags": {}, + "id": 3040, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "ensure_context", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The method to wrap.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3041, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "117" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3042, + "module": "_utils.console", + "name": "BORDER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/console.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": 
"Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a text table using Unicode characters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3043, + "module": "_utils.console", + "name": "make_table", + "parsedDocstring": { + "text": "Create a text table using Unicode characters.\n", + "args": { + "rows": "A list of tuples/lists to be displayed in the table.", + "width": "Maximum width of the table." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/console.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a text table using Unicode characters.\n" + } + ] + }, + "flags": {}, + "id": 3044, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "make_table", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of tuples/lists to be displayed in the table." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3045, + "kind": 32768, + "kindString": "Parameter", + "name": "rows", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum width of the table." 
+ } + ] + }, + "defaultValue": "100", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3046, + "kind": 32768, + "kindString": "Parameter", + "name": "width", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3048, + "module": "_utils.byte_size", + "name": "bytes", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3049, + "module": "_utils.byte_size", + "name": "__post_init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3050, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__post_init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3051, + "module": "_utils.byte_size", + "name": "validate", + "parsedDocstring": { + "text": "" 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3052, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "validate", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3053, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3054, + "module": "_utils.byte_size", + "name": "from_kb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3055, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_kb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3056, + "kind": 32768, + "kindString": "Parameter", + "name": "kb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3057, + "module": 
"_utils.byte_size", + "name": "from_mb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3058, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_mb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3059, + "kind": 32768, + "kindString": "Parameter", + "name": "mb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3060, + "module": "_utils.byte_size", + "name": "from_gb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3061, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_gb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3062, + "kind": 32768, + "kindString": "Parameter", + "name": "gb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], 
+ "flags": {}, + "groups": [], + "id": 3063, + "module": "_utils.byte_size", + "name": "from_tb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3064, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_tb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3065, + "kind": 32768, + "kindString": "Parameter", + "name": "tb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3066, + "module": "_utils.byte_size", + "name": "to_kb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3067, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_kb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3068, + "module": "_utils.byte_size", + "name": "to_mb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3069, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_mb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3070, + "module": "_utils.byte_size", + "name": "to_gb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3071, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_gb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3072, + "module": "_utils.byte_size", + "name": "to_tb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3073, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_tb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3074, + "module": "_utils.byte_size", + "name": "__str__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3075, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3076, + "module": "_utils.byte_size", + "name": "__eq__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3077, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3078, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3079, + "module": "_utils.byte_size", + "name": "__lt__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3080, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__lt__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3081, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3082, + "module": "_utils.byte_size", + "name": "__le__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__le__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3084, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3085, + "module": "_utils.byte_size", + "name": "__gt__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3086, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__gt__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3087, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3088, + "module": "_utils.byte_size", + "name": "__ge__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3089, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__ge__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3090, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3091, + "module": "_utils.byte_size", + "name": "__add__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 
+ } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__add__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3093, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3094, + "module": "_utils.byte_size", + "name": "__sub__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3095, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__sub__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3096, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3097, + "module": "_utils.byte_size", + "name": "__mul__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } 
+ ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3098, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__mul__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3099, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3100, + "module": "_utils.byte_size", + "name": "__truediv__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3101, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__truediv__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3102, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3103, + "module": "_utils.byte_size", + "name": "__rmul__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { 
+ "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3104, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__rmul__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3105, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a byte size." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3091, + 3076, + 3088, + 3085, + 3082, + 3079, + 3097, + 3049, + 3103, + 3074, + 3094, + 3100, + 3060, + 3054, + 3057, + 3063, + 3070, + 3066, + 3068, + 3072, + 3051 + ], + "title": "Methods" + }, + { + "children": [ + 3048 + ], + "title": "Properties" + } + ], + "id": 3047, + "module": "_utils.byte_size", + "name": "ByteSize", + "parsedDocstring": { + "text": "Represents a byte size." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3106, + "module": "_utils.blocked", + "name": "CLOUDFLARE_RETRY_CSS_SELECTORS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3107, + "module": "_utils.blocked", + "name": "RETRY_CSS_SELECTORS", + "parsedDocstring": { + "text": "CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3108, + "module": "_utils.blocked", + "name": "ROTATE_PROXY_ERRORS", + "parsedDocstring": { + "text": "Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3109, + "module": "_autoscaling.system_status", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3111, + "module": "_autoscaling.system_status", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "snapshotter": "The `Snapshotter` instance to be queried for `SystemStatus`.", + "max_snapshot_age": "Defines max age of snapshots used in the `SystemStatus.get_current_system_info`\nmeasurement.", + "cpu_overload_threshold": "Sets the threshold of overloaded snapshots in the CPU sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "memory_overload_threshold": "Sets the threshold of overloaded snapshots in the memory sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "event_loop_overload_threshold": "Sets the threshold of overloaded snapshots in the event loop sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "client_overload_threshold": "Sets the threshold 
of overloaded snapshots in the Client sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3112, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Snapshotter` instance to be queried for `SystemStatus`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3113, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshotter", + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "3125" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines max age of snapshots used in the `SystemStatus.get_current_system_info`\nmeasurement." + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3114, + "kind": 32768, + "kindString": "Parameter", + "name": "max_snapshot_age", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the CPU sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." 
+ } + ] + }, + "defaultValue": "0.4", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3115, + "kind": 32768, + "kindString": "Parameter", + "name": "cpu_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the memory sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + ] + }, + "defaultValue": "0.2", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3116, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the event loop sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + ] + }, + "defaultValue": "0.6", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3117, + "kind": 32768, + "kindString": "Parameter", + "name": "event_loop_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the Client sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." 
+ } + ] + }, + "defaultValue": "0.3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3118, + "kind": 32768, + "kindString": "Parameter", + "name": "client_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3119, + "module": "_autoscaling.system_status", + "name": "get_current_system_info", + "parsedDocstring": { + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n", + "returns": "An object representing the current system status." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object representing the current system status." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n" + } + ] + }, + "flags": {}, + "id": 3120, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_current_system_info", + "parameters": [], + "type": { + "name": "SystemInfo", + "type": "reference", + "target": "3183" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3121, + "module": "_autoscaling.system_status", + "name": "get_historical_system_info", + "parsedDocstring": { + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n", + "returns": "An object representing the historical system status." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object representing the historical system status." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n" + } + ] + }, + "flags": {}, + "id": 3122, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_historical_system_info", + "parameters": [], + "type": { + "name": "SystemInfo", + "type": "reference", + "target": "3183" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.\n\nThis class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical\nstatus of system resources like CPU, memory, event loop, and client API usage. It exposes two methods\n`get_current_system_info` and `get_historical_system_info`. The system information is computed using a weighted\naverage of overloaded messages in the snapshots, with the weights being the time intervals between the snapshots.\nEach resource is computed separately, and the system is considered as overloaded whenever at least one resource\nis overloaded.\n\n`get_current_system_info` returns a `SystemInfo` data structure that represents the current status\nof the system. The length of the current timeframe in seconds is configurable by the `max_snapshot_age` option\nand represents the max age of snapshots to be considered for the computation.\n\n`SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status of the system.\nIt considers the full snapshot history available in the `Snapshotter` instance." 
+ } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3111, + 3119, + 3121 + ], + "title": "Methods" + } + ], + "id": 3110, + "module": "_autoscaling.system_status", + "name": "SystemStatus", + "parsedDocstring": { + "text": "Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.\n\nThis class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical\nstatus of system resources like CPU, memory, event loop, and client API usage. It exposes two methods\n`get_current_system_info` and `get_historical_system_info`. The system information is computed using a weighted\naverage of overloaded messages in the snapshots, with the weights being the time intervals between the snapshots.\nEach resource is computed separately, and the system is considered as overloaded whenever at least one resource\nis overloaded.\n\n`get_current_system_info` returns a `SystemInfo` data structure that represents the current status\nof the system. The length of the current timeframe in seconds is configurable by the `max_snapshot_age` option\nand represents the max age of snapshots to be considered for the computation.\n\n`SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status of the system.\nIt considers the full snapshot history available in the `Snapshotter` instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3123, + "module": "_autoscaling.snapshotter", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3124, + "module": "_autoscaling.snapshotter", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3126, + "module": "_autoscaling.snapshotter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n", + "args": { + 
"max_used_cpu_ratio": "Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than\nthe provided ratio, the CPU is considered overloaded.", + "max_used_memory_ratio": "Sets the ratio, defining the maximum ratio of memory usage. When the memory usage\nis higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.", + "max_event_loop_delay": "Sets the maximum delay of the event loop. When the delay is higher than the provided\nvalue, the event loop is considered overloaded.", + "max_client_errors": "Sets the maximum number of client errors (HTTP 429). When the number of client errors\nis higher than the provided number, the client is considered overloaded.", + "max_memory_size": "Sets the maximum amount of system memory to be used by the `AutoscaledPool`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "flags": {}, + "id": 3127, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than\nthe provided ratio, the CPU is considered overloaded." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3128, + "kind": 32768, + "kindString": "Parameter", + "name": "max_used_cpu_ratio", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the ratio, defining the maximum ratio of memory usage. When the memory usage\nis higher than the provided ratio of `max_memory_size`, the memory is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3129, + "kind": 32768, + "kindString": "Parameter", + "name": "max_used_memory_ratio", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum delay of the event loop. When the delay is higher than the provided\nvalue, the event loop is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3130, + "kind": 32768, + "kindString": "Parameter", + "name": "max_event_loop_delay", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum number of client errors (HTTP 429). When the number of client errors\nis higher than the provided number, the client is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3131, + "kind": 32768, + "kindString": "Parameter", + "name": "max_client_errors", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum amount of system memory to be used by the `AutoscaledPool`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3132, + "kind": 32768, + "kindString": "Parameter", + "name": "max_memory_size", + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3133, + "module": "_autoscaling.snapshotter", + "name": "from_config", + "parsedDocstring": { + "text": "Initialize a new instance based on the provided `Configuration`.\n", + "args": { + "config": "The `Configuration` instance. Uses the global (default) one if not provided." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "flags": {}, + "id": 3134, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_config", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Uses the global (default) one if not provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3135, + "kind": 32768, + "kindString": "Parameter", + "name": "config", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "93" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "3125" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3136, + "module": "_autoscaling.snapshotter", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start capturing snapshots at configured intervals.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3137, + "module": "_autoscaling.snapshotter", + "name": "__aenter__", + "parsedDocstring": { + "text": "Start capturing snapshots at configured intervals.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start capturing snapshots at configured intervals.\n" + } + ] + }, + "flags": {}, + "id": 3138, + "kind": 
4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "3125" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to terminate resource capturing when it is no longer needed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3139, + "module": "_autoscaling.snapshotter", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to terminate resource capturing when it is no longer needed.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to terminate resource capturing when it is no longer needed.\n" + } + ] + }, + "flags": {}, + "id": 3140, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3141, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": 
"reference", + "name": "BaseException" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3142, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3143, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest memory snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3144, + "module": "_autoscaling.snapshotter", + "name": "get_memory_sample", + "parsedDocstring": { + "text": "Return a sample of the latest memory snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of memory snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of memory snapshots." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest memory snapshots.\n" + } + ] + }, + "flags": {}, + "id": 3145, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_memory_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3146, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "3215" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest event loop snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3147, + "module": "_autoscaling.snapshotter", + "name": "get_event_loop_sample", + "parsedDocstring": { + "text": "Return a sample of the latest event loop snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of event loop snapshots." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of event loop snapshots." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest event loop snapshots.\n" + } + ] + }, + "flags": {}, + "id": 3148, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_event_loop_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3149, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "3215" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest CPU snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3150, + "module": "_autoscaling.snapshotter", + "name": "get_cpu_sample", + "parsedDocstring": { + "text": "Return a sample of the latest CPU snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. 
If omitted, it returns a full history.\n" + }, + "returns": "A sample of CPU snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 206 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of CPU snapshots." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest CPU snapshots.\n" + } + ] + }, + "flags": {}, + "id": 3151, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cpu_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3152, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "3215" + } + ], + "target": "866" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest client snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 3153, + "module": "_autoscaling.snapshotter", + "name": "get_client_sample", + "parsedDocstring": { + "text": "Return a sample of the latest client snapshots.\n", + "args": { + "duration": "The duration of the sample from the 
latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of client snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 219 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of client snapshots." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest client snapshots.\n" + } + ] + }, + "flags": {}, + "id": 3154, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_client_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3155, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "3215" + } + ], + "target": "866" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Monitors and logs system resource usage at predefined intervals for performance optimization.\n\nThe class monitors and records the state of various system resources (CPU, memory, event loop, and client API)\nat predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal\nperformance of the application. 
It is utilized in the `AutoscaledPool` module to adjust task allocation\ndynamically based on the current demand and system load." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3137, + 3139, + 3126, + 3133, + 3153, + 3150, + 3147, + 3144 + ], + "title": "Methods" + }, + { + "children": [ + 3136 + ], + "title": "Properties" + } + ], + "id": 3125, + "module": "_autoscaling.snapshotter", + "name": "Snapshotter", + "parsedDocstring": { + "text": "Monitors and logs system resource usage at predefined intervals for performance optimization.\n\nThe class monitors and records the state of various system resources (CPU, memory, event loop, and client API)\nat predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal\nperformance of the application. It is utilized in the `AutoscaledPool` module to adjust task allocation\ndynamically based on the current demand and system load." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3156, + "module": "_autoscaling.autoscaled_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when an AutoscaledPool run is aborted. 
Not for direct use." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3157, + "module": "_autoscaling.autoscaled_pool", + "name": "AbortError", + "parsedDocstring": { + "text": "Raised when an AutoscaledPool run is aborted. Not for direct use." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3159, + "module": "_autoscaling.autoscaled_pool", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3160, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3159 + ], + "title": "Methods" + } + ], + "id": 3158, + "module": "_autoscaling.autoscaled_pool", + "name": "_AutoscaledPoolRun", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + 
"kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3162, + "module": "_autoscaling.autoscaled_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "system_status": "Provides data about system utilization (load).", + "concurrency_settings": "Settings of concurrency levels.", + "run_task_function": "A function that performs an asynchronous resource-intensive task.", + "is_task_ready_function": "A function that indicates whether `run_task_function` should be called. This\nfunction is called every time there is free capacity for a new task and it should indicate whether\nit should start a new task or not by resolving to either `True` or `False`. Besides its obvious use,\nit is also useful for task throttling to save resources.", + "is_finished_function": "A function that is called only when there are no tasks to be processed. If it\nresolves to `True` then the pool's run finishes. Being called only when there are no tasks being\nprocessed means that as long as `is_task_ready_function` keeps resolving to `True`,\n`is_finished_function` will never be called. To abort a run, use the `abort` method." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3163, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides data about system utilization (load)." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3164, + "kind": 32768, + "kindString": "Parameter", + "name": "system_status", + "type": { + "name": "SystemStatus", + "type": "reference", + "target": "3110" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings of concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3165, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "146" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that performs an asynchronous resource-intensive task." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3166, + "kind": 32768, + "kindString": "Parameter", + "name": "run_task_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that indicates whether `run_task_function` should be called. This\nfunction is called every time there is free capacity for a new task and it should indicate whether\nit should start a new task or not by resolving to either `True` or `False`. Besides its obvious use,\nit is also useful for task throttling to save resources." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3167, + "kind": 32768, + "kindString": "Parameter", + "name": "is_task_ready_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that is called only when there are no tasks to be processed. If it\nresolves to `True` then the pool's run finishes. Being called only when there are no tasks being\nprocessed means that as long as `is_task_ready_function` keeps resolving to `True`,\n`is_finished_function` will never be called. To abort a run, use the `abort` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3168, + "kind": 32768, + "kindString": "Parameter", + "name": "is_finished_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3169, + "module": "_autoscaling.autoscaled_pool", + "name": "run", + "parsedDocstring": { + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." + } + ] + }, + "flags": {}, + "id": 3170, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interrupt the autoscaled pool and all the tasks in progress." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3171, + "module": "_autoscaling.autoscaled_pool", + "name": "abort", + "parsedDocstring": { + "text": "Interrupt the autoscaled pool and all the tasks in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interrupt the autoscaled pool and all the tasks in progress." 
+ } + ] + }, + "flags": {}, + "id": 3172, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "abort", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pause the autoscaled pool so that it does not start new tasks." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3173, + "module": "_autoscaling.autoscaled_pool", + "name": "pause", + "parsedDocstring": { + "text": "Pause the autoscaled pool so that it does not start new tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pause the autoscaled pool so that it does not start new tasks." + } + ] + }, + "flags": {}, + "id": 3174, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pause", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3175, + "module": "_autoscaling.autoscaled_pool", + "name": "resume", + "parsedDocstring": { + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." + } + ] + }, + "flags": {}, + "id": 3176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "resume", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The current desired concurrency, possibly updated by the pool according to system load." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3177, + "module": "_autoscaling.autoscaled_pool", + "name": "desired_concurrency", + "parsedDocstring": { + "text": "The current desired concurrency, possibly updated by the pool according to system load." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of concurrent tasks in progress." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3178, + "module": "_autoscaling.autoscaled_pool", + "name": "current_concurrency", + "parsedDocstring": { + "text": "The number of concurrent tasks in progress." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.\n\nThe pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in\nany of the tasks, it is propagated and the pool is stopped." + } + ] + }, + "decorations": [ + { + "args": "('Classes')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3162, + 3171, + 3173, + 3175, + 3169 + ], + "title": "Methods" + }, + { + "children": [ + 3178, + 3177 + ], + "title": "Properties" + } + ], + "id": 3161, + "module": "_autoscaling.autoscaled_pool", + "name": "AutoscaledPool", + "parsedDocstring": { + "text": "Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.\n\nThe pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in\nany of the tasks, it is propagated and the pool is stopped." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,\nthe resource is considered as overloaded." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3180, + "module": "_autoscaling._types", + "name": "limit_ratio", + "parsedDocstring": { + "text": "The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,\nthe resource is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The actual ratio of overloaded and non-overloaded samples." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3181, + "module": "_autoscaling._types", + "name": "actual_ratio", + "parsedDocstring": { + "text": "The actual ratio of overloaded and non-overloaded samples." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the resource is currently overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3182, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the resource is currently overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent the load ratio of a resource." 
+ } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3181, + 3182, + 3180 + ], + "title": "Properties" + } + ], + "id": 3179, + "module": "_autoscaling._types", + "name": "LoadRatioInfo", + "parsedDocstring": { + "text": "Represent the load ratio of a resource." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The CPU load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3184, + "module": "_autoscaling._types", + "name": "cpu_info", + "parsedDocstring": { + "text": "The CPU load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "3179" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The memory load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3185, + "module": "_autoscaling._types", + "name": "memory_info", + "parsedDocstring": { + "text": "The memory load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "3179" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event loop load ratio." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3186, + "module": "_autoscaling._types", + "name": "event_loop_info", + "parsedDocstring": { + "text": "The event loop load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "3179" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The client load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3187, + "module": "_autoscaling._types", + "name": "client_info", + "parsedDocstring": { + "text": "The client load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "3179" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3188, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the system is currently idle or overloaded." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3189, + "module": "_autoscaling._types", + "name": "is_system_idle", + "parsedDocstring": { + "text": "Indicate whether the system is currently idle or overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation of the system info." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3190, + "module": "_autoscaling._types", + "name": "__str__", + "parsedDocstring": { + "text": "Get a string representation of the system info." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation of the system info." + } + ] + }, + "flags": {}, + "id": 3191, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent the current status of the system." 
+ } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3190 + ], + "title": "Methods" + }, + { + "children": [ + 3187, + 3184, + 3188, + 3186, + 3189, + 3185 + ], + "title": "Properties" + } + ], + "id": 3183, + "module": "_autoscaling._types", + "name": "SystemInfo", + "parsedDocstring": { + "text": "Represent the current status of the system." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ratio of CPU currently in use." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3193, + "module": "_autoscaling._types", + "name": "used_ratio", + "parsedDocstring": { + "text": "The ratio of CPU currently in use." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum ratio of CPU that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3194, + "module": "_autoscaling._types", + "name": "max_used_ratio", + "parsedDocstring": { + "text": "The maximum ratio of CPU that is considered acceptable." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3195, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the CPU is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3196, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the CPU is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A snapshot of CPU usage." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3195, + 3196, + 3194, + 3193 + ], + "title": "Properties" + } + ], + "id": 3192, + "module": "_autoscaling._types", + "name": "CpuSnapshot", + "parsedDocstring": { + "text": "A snapshot of CPU usage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3198, + "module": "_autoscaling._types", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum memory that can be used by `AutoscaledPool`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3199, + "module": "_autoscaling._types", + "name": "max_memory_size", + "parsedDocstring": { + "text": "The maximum memory that can be used by `AutoscaledPool`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "3047" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum acceptable ratio of `current_size` to `max_memory_size`." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3200, + "module": "_autoscaling._types", + "name": "max_used_memory_ratio", + "parsedDocstring": { + "text": "The maximum acceptable ratio of `current_size` to `max_memory_size`." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3201, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the memory is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3202, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the memory is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A snapshot of memory usage." 
+ } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3201, + 3198, + 3202, + 3199, + 3200 + ], + "title": "Properties" + } + ], + "id": 3197, + "module": "_autoscaling._types", + "name": "MemorySnapshot", + "parsedDocstring": { + "text": "A snapshot of memory usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The current delay of the event loop." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3204, + "module": "_autoscaling._types", + "name": "delay", + "parsedDocstring": { + "text": "The current delay of the event loop." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum delay that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3205, + "module": "_autoscaling._types", + "name": "max_delay", + "parsedDocstring": { + "text": "The maximum delay that is considered acceptable." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3206, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The amount of time by which the delay exceeds the maximum delay." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3207, + "module": "_autoscaling._types", + "name": "max_delay_exceeded", + "parsedDocstring": { + "text": "The amount of time by which the delay exceeds the maximum delay." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the event loop is considered as overloaded." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3208, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the event loop is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of the state of the event loop." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3206, + 3204, + 3208, + 3205, + 3207 + ], + "title": "Properties" + } + ], + "id": 3203, + "module": "_autoscaling._types", + "name": "EventLoopSnapshot", + "parsedDocstring": { + "text": "Snapshot of the state of the event loop." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of errors (HTTP 429) that occurred." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3210, + "module": "_autoscaling._types", + "name": "error_count", + "parsedDocstring": { + "text": "The number of errors (HTTP 429) that occurred." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of new errors (HTTP 429) that occurred since the last snapshot." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3211, + "module": "_autoscaling._types", + "name": "new_error_count", + "parsedDocstring": { + "text": "The number of new errors (HTTP 429) that occurred since the last snapshot." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of errors that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3212, + "module": "_autoscaling._types", + "name": "max_error_count", + "parsedDocstring": { + "text": "The maximum number of errors that is considered acceptable." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3213, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 146 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the client is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3214, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the client is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of the state of the client." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3213, + 3210, + 3214, + 3212, + 3211 + ], + "title": "Properties" + } + ], + "id": 3209, + "module": "_autoscaling._types", + "name": "ClientSnapshot", + "parsedDocstring": { + "text": "Snapshot of the state of the client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3215, + "module": "_autoscaling._types", + "name": "Snapshot", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2583, + 3161, + 2394, + 2336, + 2015, + 2763, + 2349, + 1879, + 662, + 2510, + 2086, + 1974, + 2299, + 1822, + 569, + 2067, + 936, + 2273, + 2709, + 2683, + 2222, + 24, + 2502, + 1787, + 1717, + 507, + 3, + 323, + 1534, + 3125, + 1403, + 3110 + ], + "title": "Classes" + }, + { + "children": [ + 2666, + 2642, + 1322, + 1310, + 1991, + 1919, + 1278, + 1266, + 1770, + 1752, + 1221, + 1209, + 1189 + ], + "title": "Abstract classes" + }, + { + "children": [ + 2565, + 2531, + 2559, + 2393, + 309, + 839, + 2329, + 146, + 93, + 1644, + 823, + 817, + 757, + 1453, + 1966, + 2625, + 1917, + 123, + 1909, + 782, + 786, + 819, + 760, + 772, + 778, + 297, + 2632, + 2266, + 2243, + 2215, + 2179, + 828, + 825, + 15, + 2499, + 398, + 801, + 794, + 807, + 821, + 763, + 1960, + 1575, + 1663, + 1470, + 750, + 153, + 834 + ], + "title": "Data structures" + }, + { + "children": [ + 86, + 81, + 91, + 75, + 70, + 69, + 92, + 76, + 63, + 62, + 61 + ], + "title": "Errors" + }, + { + "children": [ + 2058, + 2060, + 2055, + 2048, + 2051 + ], + "title": "Event payloads" + }, + { + "children": [ + 230, + 2162, + 235, + 261, + 253, + 267, + 273, + 302, + 283, + 290, + 305 + ], + "title": 
"Functions" + } + ], + "id": 0, + "kind": 1, + "kindString": "Project", + "name": "apify-client", + "sources": [ + { + "character": 0, + "fileName": "src/index.ts", + "line": 1, + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea" + } + ], + "symbolIdMap": { + "1": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/router.py" + }, + "2": { + "qualifiedName": "RequestHandler", + "sourceFileName": "/crawlee/router.py" + }, + "3": { + "qualifiedName": "Router", + "sourceFileName": "/crawlee/router.py" + }, + "4": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/router.py" + }, + "6": { + "qualifiedName": "default_handler", + "sourceFileName": "/crawlee/router.py" + }, + "9": { + "qualifiedName": "handler", + "sourceFileName": "/crawlee/router.py" + }, + "12": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/router.py" + }, + "15": { + "qualifiedName": "ProxyInfo", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "16": { + "qualifiedName": "url", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "17": { + "qualifiedName": "scheme", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "18": { + "qualifiedName": "hostname", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "19": { + "qualifiedName": "port", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "20": { + "qualifiedName": "username", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "21": { + "qualifiedName": "password", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "22": { + "qualifiedName": "session_id", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "23": { + "qualifiedName": "proxy_tier", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "24": { + "qualifiedName": "ProxyConfiguration", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "25": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "30": 
{ + "qualifiedName": "new_proxy_info", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "35": { + "qualifiedName": "new_url", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "40": { + "qualifiedName": "_ProxyTierTracker", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "41": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "44": { + "qualifiedName": "all_urls", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "45": { + "qualifiedName": "get_tier_urls", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "48": { + "qualifiedName": "add_error", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "52": { + "qualifiedName": "predict_tier", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "55": { + "qualifiedName": "_NewUrlFunction", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "56": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/proxy_configuration.py" + }, + "60": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/errors.py" + }, + "61": { + "qualifiedName": "UserDefinedErrorHandlerError", + "sourceFileName": "/crawlee/errors.py" + }, + "62": { + "qualifiedName": "SessionError", + "sourceFileName": "/crawlee/errors.py" + }, + "63": { + "qualifiedName": "ServiceConflictError", + "sourceFileName": "/crawlee/errors.py" + }, + "64": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + }, + "69": { + "qualifiedName": "ProxyError", + "sourceFileName": "/crawlee/errors.py" + }, + "70": { + "qualifiedName": "HttpStatusCodeError", + "sourceFileName": "/crawlee/errors.py" + }, + "71": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + }, + "75": { + "qualifiedName": "HttpClientStatusCodeError", + "sourceFileName": "/crawlee/errors.py" + }, + "76": { + "qualifiedName": "RequestHandlerError", + "sourceFileName": "/crawlee/errors.py" + }, + "77": { + 
"qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + }, + "81": { + "qualifiedName": "ContextPipelineInitializationError", + "sourceFileName": "/crawlee/errors.py" + }, + "82": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + }, + "86": { + "qualifiedName": "ContextPipelineFinalizationError", + "sourceFileName": "/crawlee/errors.py" + }, + "87": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + }, + "91": { + "qualifiedName": "ContextPipelineInterruptedError", + "sourceFileName": "/crawlee/errors.py" + }, + "92": { + "qualifiedName": "RequestCollisionError", + "sourceFileName": "/crawlee/errors.py" + }, + "93": { + "qualifiedName": "Configuration", + "sourceFileName": "/crawlee/configuration.py" + }, + "94": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/configuration.py" + }, + "95": { + "qualifiedName": "internal_timeout", + "sourceFileName": "/crawlee/configuration.py" + }, + "96": { + "qualifiedName": "default_browser_path", + "sourceFileName": "/crawlee/configuration.py" + }, + "97": { + "qualifiedName": "disable_browser_sandbox", + "sourceFileName": "/crawlee/configuration.py" + }, + "98": { + "qualifiedName": "log_level", + "sourceFileName": "/crawlee/configuration.py" + }, + "99": { + "qualifiedName": "default_dataset_id", + "sourceFileName": "/crawlee/configuration.py" + }, + "100": { + "qualifiedName": "default_key_value_store_id", + "sourceFileName": "/crawlee/configuration.py" + }, + "101": { + "qualifiedName": "default_request_queue_id", + "sourceFileName": "/crawlee/configuration.py" + }, + "102": { + "qualifiedName": "purge_on_start", + "sourceFileName": "/crawlee/configuration.py" + }, + "103": { + "qualifiedName": "write_metadata", + "sourceFileName": "/crawlee/configuration.py" + }, + "104": { + "qualifiedName": "persist_storage", + "sourceFileName": "/crawlee/configuration.py" + }, + "105": { + "qualifiedName": "persist_state_interval", + "sourceFileName": 
"/crawlee/configuration.py" + }, + "106": { + "qualifiedName": "system_info_interval", + "sourceFileName": "/crawlee/configuration.py" + }, + "107": { + "qualifiedName": "max_used_cpu_ratio", + "sourceFileName": "/crawlee/configuration.py" + }, + "108": { + "qualifiedName": "max_used_memory_ratio", + "sourceFileName": "/crawlee/configuration.py" + }, + "109": { + "qualifiedName": "max_event_loop_delay", + "sourceFileName": "/crawlee/configuration.py" + }, + "110": { + "qualifiedName": "max_client_errors", + "sourceFileName": "/crawlee/configuration.py" + }, + "111": { + "qualifiedName": "memory_mbytes", + "sourceFileName": "/crawlee/configuration.py" + }, + "112": { + "qualifiedName": "available_memory_ratio", + "sourceFileName": "/crawlee/configuration.py" + }, + "113": { + "qualifiedName": "storage_dir", + "sourceFileName": "/crawlee/configuration.py" + }, + "114": { + "qualifiedName": "headless", + "sourceFileName": "/crawlee/configuration.py" + }, + "115": { + "qualifiedName": "get_global_configuration", + "sourceFileName": "/crawlee/configuration.py" + }, + "117": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/_types.py" + }, + "118": { + "qualifiedName": "HttpMethod", + "sourceFileName": "/crawlee/_types.py" + }, + "119": { + "qualifiedName": "HttpPayload", + "sourceFileName": "/crawlee/_types.py" + }, + "120": { + "qualifiedName": "RequestTransformAction", + "sourceFileName": "/crawlee/_types.py" + }, + "121": { + "qualifiedName": "EnqueueStrategy", + "sourceFileName": "/crawlee/_types.py" + }, + "122": { + "qualifiedName": "SkippedReason", + "sourceFileName": "/crawlee/_types.py" + }, + "123": { + "qualifiedName": "HttpHeaders", + "sourceFileName": "/crawlee/_types.py" + }, + "124": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_types.py" + }, + "125": { + "qualifiedName": "root", + "sourceFileName": "/crawlee/_types.py" + }, + "126": { + "qualifiedName": "__getitem__", + "sourceFileName": "/crawlee/_types.py" + }, + "129": { + 
"qualifiedName": "__setitem__", + "sourceFileName": "/crawlee/_types.py" + }, + "133": { + "qualifiedName": "__delitem__", + "sourceFileName": "/crawlee/_types.py" + }, + "136": { + "qualifiedName": "__or__", + "sourceFileName": "/crawlee/_types.py" + }, + "139": { + "qualifiedName": "__ror__", + "sourceFileName": "/crawlee/_types.py" + }, + "142": { + "qualifiedName": "__iter__", + "sourceFileName": "/crawlee/_types.py" + }, + "144": { + "qualifiedName": "__len__", + "sourceFileName": "/crawlee/_types.py" + }, + "146": { + "qualifiedName": "ConcurrencySettings", + "sourceFileName": "/crawlee/_types.py" + }, + "147": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_types.py" + }, + "153": { + "qualifiedName": "StorageTypes", + "sourceFileName": "/crawlee/_types.py" + }, + "154": { + "qualifiedName": "DATASET", + "sourceFileName": "/crawlee/_types.py" + }, + "155": { + "qualifiedName": "KEY_VALUE_STORE", + "sourceFileName": "/crawlee/_types.py" + }, + "156": { + "qualifiedName": "REQUEST_QUEUE", + "sourceFileName": "/crawlee/_types.py" + }, + "157": { + "qualifiedName": "EnqueueLinksKwargs", + "sourceFileName": "/crawlee/_types.py" + }, + "158": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/_types.py" + }, + "159": { + "qualifiedName": "base_url", + "sourceFileName": "/crawlee/_types.py" + }, + "160": { + "qualifiedName": "strategy", + "sourceFileName": "/crawlee/_types.py" + }, + "161": { + "qualifiedName": "include", + "sourceFileName": "/crawlee/_types.py" + }, + "162": { + "qualifiedName": "exclude", + "sourceFileName": "/crawlee/_types.py" + }, + "163": { + "qualifiedName": "AddRequestsKwargs", + "sourceFileName": "/crawlee/_types.py" + }, + "164": { + "qualifiedName": "requests", + "sourceFileName": "/crawlee/_types.py" + }, + "165": { + "qualifiedName": "PushDataKwargs", + "sourceFileName": "/crawlee/_types.py" + }, + "166": { + "qualifiedName": "PushDataFunctionCall", + "sourceFileName": "/crawlee/_types.py" + }, + "167": { + 
"qualifiedName": "data", + "sourceFileName": "/crawlee/_types.py" + }, + "168": { + "qualifiedName": "dataset_id", + "sourceFileName": "/crawlee/_types.py" + }, + "169": { + "qualifiedName": "dataset_name", + "sourceFileName": "/crawlee/_types.py" + }, + "170": { + "qualifiedName": "KeyValueStoreInterface", + "sourceFileName": "/crawlee/_types.py" + }, + "171": { + "qualifiedName": "get_value", + "sourceFileName": "/crawlee/_types.py" + }, + "175": { + "qualifiedName": "set_value", + "sourceFileName": "/crawlee/_types.py" + }, + "188": { + "qualifiedName": "KeyValueStoreValue", + "sourceFileName": "/crawlee/_types.py" + }, + "189": { + "qualifiedName": "content", + "sourceFileName": "/crawlee/_types.py" + }, + "190": { + "qualifiedName": "content_type", + "sourceFileName": "/crawlee/_types.py" + }, + "191": { + "qualifiedName": "KeyValueStoreChangeRecords", + "sourceFileName": "/crawlee/_types.py" + }, + "192": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_types.py" + }, + "195": { + "qualifiedName": "set_value", + "sourceFileName": "/crawlee/_types.py" + }, + "200": { + "qualifiedName": "get_value", + "sourceFileName": "/crawlee/_types.py" + }, + "212": { + "qualifiedName": "RequestHandlerRunResult", + "sourceFileName": "/crawlee/_types.py" + }, + "213": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_types.py" + }, + "216": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "220": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "226": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "230": { + "qualifiedName": "AddRequestsFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "231": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "235": { + "qualifiedName": "EnqueueLinksFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "236": { + "qualifiedName": "__call__", + 
"sourceFileName": "/crawlee/_types.py" + }, + "253": { + "qualifiedName": "ExtractLinksFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "254": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "261": { + "qualifiedName": "ExportToFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "262": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "267": { + "qualifiedName": "GetDataFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "268": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "273": { + "qualifiedName": "GetKeyValueStoreFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "274": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "278": { + "qualifiedName": "GetKeyValueStoreFromRequestHandlerFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "279": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "283": { + "qualifiedName": "PushDataFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "284": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "290": { + "qualifiedName": "SendRequestFunction", + "sourceFileName": "/crawlee/_types.py" + }, + "291": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "297": { + "qualifiedName": "PageSnapshot", + "sourceFileName": "/crawlee/_types.py" + }, + "298": { + "qualifiedName": "screenshot", + "sourceFileName": "/crawlee/_types.py" + }, + "299": { + "qualifiedName": "html", + "sourceFileName": "/crawlee/_types.py" + }, + "300": { + "qualifiedName": "__bool__", + "sourceFileName": "/crawlee/_types.py" + }, + "302": { + "qualifiedName": "GetPageSnapshot", + "sourceFileName": "/crawlee/_types.py" + }, + "303": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "305": { + "qualifiedName": "UseStateFunction", + "sourceFileName": "/crawlee/_types.py" 
+ }, + "306": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/_types.py" + }, + "309": { + "qualifiedName": "BasicCrawlingContext", + "sourceFileName": "/crawlee/_types.py" + }, + "310": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "311": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "312": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "313": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "314": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "315": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "316": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "317": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "318": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "319": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/_types.py" + }, + "321": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "323": { + "qualifiedName": "ServiceLocator", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "324": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "326": { + "qualifiedName": "get_configuration", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "328": { + "qualifiedName": "set_configuration", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "331": { + "qualifiedName": "get_event_manager", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "333": { + "qualifiedName": "set_event_manager", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "336": { + "qualifiedName": "get_storage_client", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "338": { + "qualifiedName": "set_storage_client", + "sourceFileName": 
"/crawlee/_service_locator.py" + }, + "341": { + "qualifiedName": "service_locator", + "sourceFileName": "/crawlee/_service_locator.py" + }, + "342": { + "qualifiedName": "RequestState", + "sourceFileName": "/crawlee/_request.py" + }, + "343": { + "qualifiedName": "UNPROCESSED", + "sourceFileName": "/crawlee/_request.py" + }, + "344": { + "qualifiedName": "BEFORE_NAV", + "sourceFileName": "/crawlee/_request.py" + }, + "345": { + "qualifiedName": "AFTER_NAV", + "sourceFileName": "/crawlee/_request.py" + }, + "346": { + "qualifiedName": "REQUEST_HANDLER", + "sourceFileName": "/crawlee/_request.py" + }, + "347": { + "qualifiedName": "DONE", + "sourceFileName": "/crawlee/_request.py" + }, + "348": { + "qualifiedName": "ERROR_HANDLER", + "sourceFileName": "/crawlee/_request.py" + }, + "349": { + "qualifiedName": "ERROR", + "sourceFileName": "/crawlee/_request.py" + }, + "350": { + "qualifiedName": "SKIPPED", + "sourceFileName": "/crawlee/_request.py" + }, + "351": { + "qualifiedName": "CrawleeRequestData", + "sourceFileName": "/crawlee/_request.py" + }, + "352": { + "qualifiedName": "max_retries", + "sourceFileName": "/crawlee/_request.py" + }, + "353": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/crawlee/_request.py" + }, + "354": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/_request.py" + }, + "355": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/crawlee/_request.py" + }, + "356": { + "qualifiedName": "skip_navigation", + "sourceFileName": "/crawlee/_request.py" + }, + "357": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/crawlee/_request.py" + }, + "358": { + "qualifiedName": "forefront", + "sourceFileName": "/crawlee/_request.py" + }, + "359": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/crawlee/_request.py" + }, + "360": { + "qualifiedName": "session_id", + "sourceFileName": "/crawlee/_request.py" + }, + "361": { + "qualifiedName": "UserData", + "sourceFileName": 
"/crawlee/_request.py" + }, + "362": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_request.py" + }, + "363": { + "qualifiedName": "__pydantic_extra__", + "sourceFileName": "/crawlee/_request.py" + }, + "364": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/crawlee/_request.py" + }, + "365": { + "qualifiedName": "label", + "sourceFileName": "/crawlee/_request.py" + }, + "366": { + "qualifiedName": "__getitem__", + "sourceFileName": "/crawlee/_request.py" + }, + "369": { + "qualifiedName": "__setitem__", + "sourceFileName": "/crawlee/_request.py" + }, + "373": { + "qualifiedName": "__delitem__", + "sourceFileName": "/crawlee/_request.py" + }, + "376": { + "qualifiedName": "__iter__", + "sourceFileName": "/crawlee/_request.py" + }, + "378": { + "qualifiedName": "__len__", + "sourceFileName": "/crawlee/_request.py" + }, + "380": { + "qualifiedName": "__eq__", + "sourceFileName": "/crawlee/_request.py" + }, + "383": { + "qualifiedName": "user_data_adapter", + "sourceFileName": "/crawlee/_request.py" + }, + "384": { + "qualifiedName": "RequestOptions", + "sourceFileName": "/crawlee/_request.py" + }, + "385": { + "qualifiedName": "url", + "sourceFileName": "/crawlee/_request.py" + }, + "386": { + "qualifiedName": "method", + "sourceFileName": "/crawlee/_request.py" + }, + "387": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/_request.py" + }, + "388": { + "qualifiedName": "payload", + "sourceFileName": "/crawlee/_request.py" + }, + "389": { + "qualifiedName": "label", + "sourceFileName": "/crawlee/_request.py" + }, + "390": { + "qualifiedName": "session_id", + "sourceFileName": "/crawlee/_request.py" + }, + "391": { + "qualifiedName": "unique_key", + "sourceFileName": "/crawlee/_request.py" + }, + "392": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/_request.py" + }, + "393": { + "qualifiedName": "keep_url_fragment", + "sourceFileName": "/crawlee/_request.py" + }, + "394": { + "qualifiedName": 
"use_extended_unique_key", + "sourceFileName": "/crawlee/_request.py" + }, + "395": { + "qualifiedName": "always_enqueue", + "sourceFileName": "/crawlee/_request.py" + }, + "396": { + "qualifiedName": "user_data", + "sourceFileName": "/crawlee/_request.py" + }, + "397": { + "qualifiedName": "no_retry", + "sourceFileName": "/crawlee/_request.py" + }, + "398": { + "qualifiedName": "Request", + "sourceFileName": "/crawlee/_request.py" + }, + "399": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_request.py" + }, + "400": { + "qualifiedName": "url", + "sourceFileName": "/crawlee/_request.py" + }, + "401": { + "qualifiedName": "method", + "sourceFileName": "/crawlee/_request.py" + }, + "402": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/_request.py" + }, + "403": { + "qualifiedName": "payload", + "sourceFileName": "/crawlee/_request.py" + }, + "404": { + "qualifiedName": "user_data", + "sourceFileName": "/crawlee/_request.py" + }, + "405": { + "qualifiedName": "retry_count", + "sourceFileName": "/crawlee/_request.py" + }, + "406": { + "qualifiedName": "no_retry", + "sourceFileName": "/crawlee/_request.py" + }, + "407": { + "qualifiedName": "loaded_url", + "sourceFileName": "/crawlee/_request.py" + }, + "408": { + "qualifiedName": "handled_at", + "sourceFileName": "/crawlee/_request.py" + }, + "409": { + "qualifiedName": "unique_key", + "sourceFileName": "/crawlee/_request.py" + }, + "410": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/_request.py" + }, + "411": { + "qualifiedName": "from_url", + "sourceFileName": "/crawlee/_request.py" + }, + "425": { + "qualifiedName": "get_query_param_from_url", + "sourceFileName": "/crawlee/_request.py" + }, + "429": { + "qualifiedName": "label", + "sourceFileName": "/crawlee/_request.py" + }, + "430": { + "qualifiedName": "session_id", + "sourceFileName": "/crawlee/_request.py" + }, + "431": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/crawlee/_request.py" + }, + "432": { 
+ "qualifiedName": "crawl_depth", + "sourceFileName": "/crawlee/_request.py" + }, + "433": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/crawlee/_request.py" + }, + "436": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/_request.py" + }, + "437": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/_request.py" + }, + "440": { + "qualifiedName": "max_retries", + "sourceFileName": "/crawlee/_request.py" + }, + "441": { + "qualifiedName": "max_retries", + "sourceFileName": "/crawlee/_request.py" + }, + "444": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/crawlee/_request.py" + }, + "445": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/crawlee/_request.py" + }, + "448": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/crawlee/_request.py" + }, + "449": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/crawlee/_request.py" + }, + "452": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/crawlee/_request.py" + }, + "453": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/crawlee/_request.py" + }, + "456": { + "qualifiedName": "forefront", + "sourceFileName": "/crawlee/_request.py" + }, + "457": { + "qualifiedName": "forefront", + "sourceFileName": "/crawlee/_request.py" + }, + "460": { + "qualifiedName": "RequestWithLock", + "sourceFileName": "/crawlee/_request.py" + }, + "461": { + "qualifiedName": "lock_expires_at", + "sourceFileName": "/crawlee/_request.py" + }, + "462": { + "qualifiedName": "get_configured_log_level", + "sourceFileName": "/crawlee/_log_config.py" + }, + "464": { + "qualifiedName": "configure_logger", + "sourceFileName": "/crawlee/_log_config.py" + }, + "468": { + "qualifiedName": "CrawleeLogFormatter", + "sourceFileName": "/crawlee/_log_config.py" + }, + "469": { + "qualifiedName": "empty_record", + "sourceFileName": "/crawlee/_log_config.py" + }, + "470": { + "qualifiedName": "__init__", + "sourceFileName": 
"/crawlee/_log_config.py" + }, + "475": { + "qualifiedName": "format", + "sourceFileName": "/crawlee/_log_config.py" + }, + "478": { + "qualifiedName": "METADATA_FILENAME", + "sourceFileName": "/crawlee/_consts.py" + }, + "479": { + "qualifiedName": "cli", + "sourceFileName": "/crawlee/_cli.py" + }, + "480": { + "qualifiedName": "template_directory", + "sourceFileName": "/crawlee/_cli.py" + }, + "481": { + "qualifiedName": "crawler_choices", + "sourceFileName": "/crawlee/_cli.py" + }, + "482": { + "qualifiedName": "http_client_choices", + "sourceFileName": "/crawlee/_cli.py" + }, + "483": { + "qualifiedName": "package_manager_choices", + "sourceFileName": "/crawlee/_cli.py" + }, + "484": { + "qualifiedName": "default_start_url", + "sourceFileName": "/crawlee/_cli.py" + }, + "485": { + "qualifiedName": "callback", + "sourceFileName": "/crawlee/_cli.py" + }, + "488": { + "qualifiedName": "create", + "sourceFileName": "/crawlee/_cli.py" + }, + "496": { + "qualifiedName": "patch_browserforge", + "sourceFileName": "/crawlee/_browserforge_workaround.py" + }, + "498": { + "qualifiedName": "__version__", + "sourceFileName": "/crawlee/__init__.py" + }, + "499": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "500": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "501": { + "qualifiedName": "CachedRequest", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "502": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "503": { + "qualifiedName": "was_already_handled", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "504": { + "qualifiedName": "hydrated", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "505": { + "qualifiedName": "lock_expires_at", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "506": { + "qualifiedName": "forefront", + "sourceFileName": 
"/crawlee/storages/_request_queue.py" + }, + "507": { + "qualifiedName": "RequestQueue", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "508": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "513": { + "qualifiedName": "from_storage_object", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "517": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "518": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "519": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "520": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "523": { + "qualifiedName": "open", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "529": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "532": { + "qualifiedName": "add_request", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "536": { + "qualifiedName": "add_requests_batched", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "543": { + "qualifiedName": "get_request", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "546": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "548": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "551": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "555": { + "qualifiedName": "is_empty", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "557": { + "qualifiedName": "is_finished", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "559": { + "qualifiedName": "get_info", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + 
"561": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "563": { + "qualifiedName": "get_total_count", + "sourceFileName": "/crawlee/storages/_request_queue.py" + }, + "565": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "566": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "567": { + "qualifiedName": "AutosavedValue", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "568": { + "qualifiedName": "root", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "569": { + "qualifiedName": "KeyValueStore", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "570": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "575": { + "qualifiedName": "from_storage_object", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "579": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "580": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "581": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "582": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "585": { + "qualifiedName": "get_info", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "587": { + "qualifiedName": "open", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "593": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "595": { + "qualifiedName": "get_value", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "599": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "602": { + "qualifiedName": "set_value", + 
"sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "607": { + "qualifiedName": "get_public_url", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "610": { + "qualifiedName": "get_auto_saved_value", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "614": { + "qualifiedName": "persist_autosaved_values", + "sourceFileName": "/crawlee/storages/_key_value_store.py" + }, + "624": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "625": { + "qualifiedName": "GetDataKwargs", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "626": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "627": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "628": { + "qualifiedName": "clean", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "629": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "630": { + "qualifiedName": "fields", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "631": { + "qualifiedName": "omit", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "632": { + "qualifiedName": "unwind", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "633": { + "qualifiedName": "skip_empty", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "634": { + "qualifiedName": "skip_hidden", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "635": { + "qualifiedName": "flatten", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "636": { + "qualifiedName": "view", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "637": { + "qualifiedName": "ExportToKwargs", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "638": { + "qualifiedName": "key", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "639": { + "qualifiedName": "content_type", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "640": 
{ + "qualifiedName": "to_key_value_store_id", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "641": { + "qualifiedName": "to_key_value_store_name", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "642": { + "qualifiedName": "ExportDataJsonKwargs", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "643": { + "qualifiedName": "skipkeys", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "644": { + "qualifiedName": "ensure_ascii", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "645": { + "qualifiedName": "check_circular", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "646": { + "qualifiedName": "allow_nan", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "647": { + "qualifiedName": "cls", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "648": { + "qualifiedName": "indent", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "649": { + "qualifiedName": "separators", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "650": { + "qualifiedName": "default", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "651": { + "qualifiedName": "sort_keys", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "652": { + "qualifiedName": "ExportDataCsvKwargs", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "653": { + "qualifiedName": "dialect", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "654": { + "qualifiedName": "delimiter", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "655": { + "qualifiedName": "doublequote", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "656": { + "qualifiedName": "escapechar", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "657": { + "qualifiedName": "lineterminator", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "658": { + "qualifiedName": "quotechar", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "659": { + "qualifiedName": "quoting", + 
"sourceFileName": "/crawlee/storages/_dataset.py" + }, + "660": { + "qualifiedName": "skipinitialspace", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "661": { + "qualifiedName": "strict", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "662": { + "qualifiedName": "Dataset", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "663": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "668": { + "qualifiedName": "from_storage_object", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "672": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "673": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "674": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "675": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "678": { + "qualifiedName": "open", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "684": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "686": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "690": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "693": { + "qualifiedName": "write_to_csv", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "697": { + "qualifiedName": "write_to_json", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "701": { + "qualifiedName": "export_to", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "704": { + "qualifiedName": "get_info", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "706": { + "qualifiedName": "iterate_items", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "717": { + "qualifiedName": "check_and_serialize", + "sourceFileName": "/crawlee/storages/_dataset.py" + }, + "721": { + 
"qualifiedName": "TResource", + "sourceFileName": "/crawlee/storages/_creation_management.py" + }, + "722": { + "qualifiedName": "open_storage", + "sourceFileName": "/crawlee/storages/_creation_management.py" + }, + "729": { + "qualifiedName": "remove_storage_from_cache", + "sourceFileName": "/crawlee/storages/_creation_management.py" + }, + "734": { + "qualifiedName": "Storage", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "735": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "736": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "737": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "738": { + "qualifiedName": "storage_object", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "741": { + "qualifiedName": "open", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "747": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/storages/_base.py" + }, + "749": { + "qualifiedName": "KvsValueType", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "750": { + "qualifiedName": "StorageMetadata", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "751": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "752": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "753": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "754": { + "qualifiedName": "accessed_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "755": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "756": { + "qualifiedName": "modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "757": { + "qualifiedName": "DatasetMetadata", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "758": { + "qualifiedName": 
"model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "759": { + "qualifiedName": "item_count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "760": { + "qualifiedName": "KeyValueStoreMetadata", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "761": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "762": { + "qualifiedName": "user_id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "763": { + "qualifiedName": "RequestQueueMetadata", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "764": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "765": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "766": { + "qualifiedName": "handled_request_count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "767": { + "qualifiedName": "pending_request_count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "768": { + "qualifiedName": "stats", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "769": { + "qualifiedName": "total_request_count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "770": { + "qualifiedName": "user_id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "771": { + "qualifiedName": "resource_directory", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "772": { + "qualifiedName": "KeyValueStoreRecord", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "773": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "774": { + "qualifiedName": "key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "775": { + "qualifiedName": "value", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "776": { + "qualifiedName": "content_type", + 
"sourceFileName": "/crawlee/storage_clients/models.py" + }, + "777": { + "qualifiedName": "filename", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "778": { + "qualifiedName": "KeyValueStoreRecordMetadata", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "779": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "780": { + "qualifiedName": "key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "781": { + "qualifiedName": "content_type", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "782": { + "qualifiedName": "KeyValueStoreKeyInfo", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "783": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "784": { + "qualifiedName": "key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "785": { + "qualifiedName": "size", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "786": { + "qualifiedName": "KeyValueStoreListKeysPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "787": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "788": { + "qualifiedName": "count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "789": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "790": { + "qualifiedName": "is_truncated", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "791": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "792": { + "qualifiedName": "exclusive_start_key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "793": { + "qualifiedName": "next_exclusive_start_key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "794": { + "qualifiedName": "RequestQueueHeadState", + "sourceFileName": 
"/crawlee/storage_clients/models.py" + }, + "795": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "796": { + "qualifiedName": "was_limit_reached", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "797": { + "qualifiedName": "prev_limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "798": { + "qualifiedName": "queue_modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "799": { + "qualifiedName": "query_started_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "800": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "801": { + "qualifiedName": "RequestQueueHead", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "802": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "803": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "804": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "805": { + "qualifiedName": "queue_modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "806": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "807": { + "qualifiedName": "RequestQueueHeadWithLocks", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "808": { + "qualifiedName": "lock_secs", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "809": { + "qualifiedName": "queue_has_locked_requests", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "810": { + "qualifiedName": "_ListPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "811": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "812": { + "qualifiedName": "count", + "sourceFileName": 
"/crawlee/storage_clients/models.py" + }, + "813": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "814": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "815": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "816": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "817": { + "qualifiedName": "DatasetListPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "818": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "819": { + "qualifiedName": "KeyValueStoreListPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "820": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "821": { + "qualifiedName": "RequestQueueListPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "822": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "823": { + "qualifiedName": "DatasetItemsListPage", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "824": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "825": { + "qualifiedName": "ProlongRequestLockResponse", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "826": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "827": { + "qualifiedName": "lock_expires_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "828": { + "qualifiedName": "ProcessedRequest", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "829": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "830": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "831": { + "qualifiedName": 
"unique_key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "832": { + "qualifiedName": "was_already_present", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "833": { + "qualifiedName": "was_already_handled", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "834": { + "qualifiedName": "UnprocessedRequest", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "835": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "836": { + "qualifiedName": "unique_key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "837": { + "qualifiedName": "url", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "838": { + "qualifiedName": "method", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "839": { + "qualifiedName": "BatchRequestsOperationResponse", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "840": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "841": { + "qualifiedName": "processed_requests", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "842": { + "qualifiedName": "unprocessed_requests", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "843": { + "qualifiedName": "InternalRequest", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "844": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "845": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "846": { + "qualifiedName": "unique_key", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "847": { + "qualifiedName": "order_no", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "848": { + "qualifiedName": "handled_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "849": { + "qualifiedName": "request", + "sourceFileName": 
"/crawlee/storage_clients/models.py" + }, + "850": { + "qualifiedName": "from_request", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "855": { + "qualifiedName": "to_request", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "857": { + "qualifiedName": "RequestQueueCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py" + }, + "858": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py" + }, + "861": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py" + }, + "866": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_collection_client.py" + }, + "872": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "873": { + "qualifiedName": "RequestQueueClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "874": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "884": { + "qualifiedName": "resource_info", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "885": { + "qualifiedName": "resource_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "886": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "888": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "891": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "893": { + "qualifiedName": "list_head", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "897": { + 
"qualifiedName": "list_and_lock_head", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "901": { + "qualifiedName": "add_request", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "905": { + "qualifiedName": "get_request", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "908": { + "qualifiedName": "update_request", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "912": { + "qualifiedName": "delete_request", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "915": { + "qualifiedName": "prolong_request_lock", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "920": { + "qualifiedName": "delete_request_lock", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "924": { + "qualifiedName": "batch_add_requests", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "928": { + "qualifiedName": "batch_delete_requests", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "931": { + "qualifiedName": "update_timestamps", + "sourceFileName": "/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "934": { + "qualifiedName": "TResourceClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "935": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "936": { + "qualifiedName": "MemoryStorageClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "937": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "945": { + "qualifiedName": "from_config", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, 
+ "948": { + "qualifiedName": "write_metadata", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "949": { + "qualifiedName": "persist_storage", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "950": { + "qualifiedName": "storage_dir", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "951": { + "qualifiedName": "datasets_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "952": { + "qualifiedName": "key_value_stores_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "953": { + "qualifiedName": "request_queues_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "954": { + "qualifiedName": "dataset", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "957": { + "qualifiedName": "datasets", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "959": { + "qualifiedName": "key_value_store", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "962": { + "qualifiedName": "key_value_stores", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "964": { + "qualifiedName": "request_queue", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "967": { + "qualifiedName": "request_queues", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "969": { + "qualifiedName": "purge_on_start", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "971": { + "qualifiedName": "get_cached_resource_client", + "sourceFileName": "/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "976": { + "qualifiedName": "add_resource_client_to_cache", + "sourceFileName": 
"/crawlee/storage_clients/_memory/_memory_storage_client.py" + }, + "979": { + "qualifiedName": "KeyValueStoreCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py" + }, + "980": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py" + }, + "983": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py" + }, + "988": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_collection_client.py" + }, + "994": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "995": { + "qualifiedName": "KeyValueStoreClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "996": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1004": { + "qualifiedName": "resource_info", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1005": { + "qualifiedName": "resource_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1006": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1008": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1011": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1013": { + "qualifiedName": "list_keys", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1017": { + "qualifiedName": "get_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1020": { + "qualifiedName": 
"get_record_as_bytes", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1023": { + "qualifiedName": "stream_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1026": { + "qualifiedName": "set_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1031": { + "qualifiedName": "delete_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1034": { + "qualifiedName": "get_public_url", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1037": { + "qualifiedName": "persist_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1040": { + "qualifiedName": "delete_persisted_record", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1043": { + "qualifiedName": "update_timestamps", + "sourceFileName": "/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "1046": { + "qualifiedName": "DatasetCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py" + }, + "1047": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py" + }, + "1050": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py" + }, + "1055": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_collection_client.py" + }, + "1061": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1062": { + "qualifiedName": "DatasetClient", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1063": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1072": { + 
"qualifiedName": "resource_info", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1073": { + "qualifiedName": "resource_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1074": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1076": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1079": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1081": { + "qualifiedName": "list_items", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1094": { + "qualifiedName": "iterate_items", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1105": { + "qualifiedName": "get_items_as_bytes", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1123": { + "qualifiedName": "stream_items", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1140": { + "qualifiedName": "push_items", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1143": { + "qualifiedName": "update_timestamps", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1146": { + "qualifiedName": "get_start_and_end_indexes", + "sourceFileName": "/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "1150": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1151": { + "qualifiedName": "persist_metadata_if_enabled", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1156": { + "qualifiedName": "find_or_create_client_by_id_or_name_inner", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1162": { + "qualifiedName": "get_or_create_inner", + 
"sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1169": { + "qualifiedName": "create_dataset_from_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1175": { + "qualifiedName": "create_kvs_from_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1181": { + "qualifiedName": "create_rq_from_directory", + "sourceFileName": "/crawlee/storage_clients/_memory/_creation_management.py" + }, + "1187": { + "qualifiedName": "ResourceClient", + "sourceFileName": "/crawlee/storage_clients/_base/_types.py" + }, + "1188": { + "qualifiedName": "ResourceCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_base/_types.py" + }, + "1189": { + "qualifiedName": "StorageClient", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1190": { + "qualifiedName": "dataset", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1193": { + "qualifiedName": "datasets", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1195": { + "qualifiedName": "key_value_store", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1198": { + "qualifiedName": "key_value_stores", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1200": { + "qualifiedName": "request_queue", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1203": { + "qualifiedName": "request_queues", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1205": { + "qualifiedName": "purge_on_start", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1207": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "1209": { + "qualifiedName": "RequestQueueCollectionClient", + "sourceFileName": 
"/crawlee/storage_clients/_base/_request_queue_collection_client.py" + }, + "1210": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_collection_client.py" + }, + "1215": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_collection_client.py" + }, + "1221": { + "qualifiedName": "RequestQueueClient", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1222": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1224": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1227": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1229": { + "qualifiedName": "list_head", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1232": { + "qualifiedName": "list_and_lock_head", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1236": { + "qualifiedName": "add_request", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1240": { + "qualifiedName": "batch_add_requests", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1244": { + "qualifiedName": "get_request", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1247": { + "qualifiedName": "update_request", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1251": { + "qualifiedName": "delete_request", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1254": { + "qualifiedName": "prolong_request_lock", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1259": { + "qualifiedName": "delete_request_lock", + "sourceFileName": 
"/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1263": { + "qualifiedName": "batch_delete_requests", + "sourceFileName": "/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "1266": { + "qualifiedName": "KeyValueStoreCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py" + }, + "1267": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py" + }, + "1272": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_collection_client.py" + }, + "1278": { + "qualifiedName": "KeyValueStoreClient", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1279": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1281": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1284": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1286": { + "qualifiedName": "list_keys", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1290": { + "qualifiedName": "get_record", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1293": { + "qualifiedName": "get_record_as_bytes", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1296": { + "qualifiedName": "stream_record", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1299": { + "qualifiedName": "set_record", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1304": { + "qualifiedName": "delete_record", + "sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1307": { + "qualifiedName": "get_public_url", + 
"sourceFileName": "/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "1310": { + "qualifiedName": "DatasetCollectionClient", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py" + }, + "1311": { + "qualifiedName": "get_or_create", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py" + }, + "1316": { + "qualifiedName": "list", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_collection_client.py" + }, + "1322": { + "qualifiedName": "DatasetClient", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1323": { + "qualifiedName": "get", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1325": { + "qualifiedName": "update", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1328": { + "qualifiedName": "delete", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1330": { + "qualifiedName": "list_items", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1343": { + "qualifiedName": "iterate_items", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1354": { + "qualifiedName": "get_items_as_bytes", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1372": { + "qualifiedName": "stream_items", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1389": { + "qualifiedName": "push_items", + "sourceFileName": "/crawlee/storage_clients/_base/_dataset_client.py" + }, + "1392": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1393": { + "qualifiedName": "TNewStatisticsState", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1394": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1395": { + "qualifiedName": "RequestProcessingRecord", + 
"sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1396": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1398": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1400": { + "qualifiedName": "finish", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1402": { + "qualifiedName": "retry_count", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1403": { + "qualifiedName": "Statistics", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1404": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1415": { + "qualifiedName": "replace_state_model", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1418": { + "qualifiedName": "with_default_state", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1428": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1429": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1431": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1436": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1437": { + "qualifiedName": "register_status_code", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1440": { + "qualifiedName": "record_request_processing_start", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1443": { + "qualifiedName": "record_request_processing_finish", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1446": { + "qualifiedName": "record_request_processing_failure", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1449": { + "qualifiedName": "calculate", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1451": { + "qualifiedName": "reset", + 
"sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "1453": { + "qualifiedName": "FinalStatistics", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1454": { + "qualifiedName": "requests_finished", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1455": { + "qualifiedName": "requests_failed", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1456": { + "qualifiedName": "retry_histogram", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1457": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1458": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1459": { + "qualifiedName": "requests_finished_per_minute", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1460": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1461": { + "qualifiedName": "request_total_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1462": { + "qualifiedName": "requests_total", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1463": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1464": { + "qualifiedName": "to_table", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1466": { + "qualifiedName": "to_dict", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1468": { + "qualifiedName": "__str__", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1470": { + "qualifiedName": "StatisticsState", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1471": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1472": { + "qualifiedName": "stats_id", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1473": { + "qualifiedName": "requests_finished", + "sourceFileName": 
"/crawlee/statistics/_models.py" + }, + "1474": { + "qualifiedName": "requests_failed", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1475": { + "qualifiedName": "requests_retries", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1476": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1477": { + "qualifiedName": "requests_finished_per_minute", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1478": { + "qualifiedName": "request_min_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1479": { + "qualifiedName": "request_max_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1480": { + "qualifiedName": "request_total_failed_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1481": { + "qualifiedName": "request_total_finished_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1482": { + "qualifiedName": "crawler_started_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1483": { + "qualifiedName": "crawler_last_started_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1484": { + "qualifiedName": "crawler_finished_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1485": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1486": { + "qualifiedName": "errors", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1487": { + "qualifiedName": "retry_errors", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1488": { + "qualifiedName": "requests_with_status_code", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1489": { + "qualifiedName": "stats_persisted_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1490": { + "qualifiedName": "request_retry_histogram", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1491": { + "qualifiedName": 
"request_total_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1492": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1493": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1494": { + "qualifiedName": "requests_total", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "1495": { + "qualifiedName": "GroupName", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1496": { + "qualifiedName": "ErrorFilenameGroups", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1497": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1498": { + "qualifiedName": "ErrorTracker", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1499": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1507": { + "qualifiedName": "add", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1512": { + "qualifiedName": "unique_error_count", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1513": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1514": { + "qualifiedName": "get_most_common_errors", + "sourceFileName": "/crawlee/statistics/_error_tracker.py" + }, + "1517": { + "qualifiedName": "ErrorSnapshotter", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1518": { + "qualifiedName": "MAX_ERROR_CHARACTERS", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1519": { + "qualifiedName": "MAX_HASH_LENGTH", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1520": { + "qualifiedName": "MAX_FILENAME_LENGTH", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1521": { + "qualifiedName": "BASE_MESSAGE", + "sourceFileName": 
"/crawlee/statistics/_error_snapshotter.py" + }, + "1522": { + "qualifiedName": "SNAPSHOT_PREFIX", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1523": { + "qualifiedName": "ALLOWED_CHARACTERS", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1524": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1527": { + "qualifiedName": "capture_snapshot", + "sourceFileName": "/crawlee/statistics/_error_snapshotter.py" + }, + "1532": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1533": { + "qualifiedName": "CreateSessionFunctionType", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1534": { + "qualifiedName": "SessionPool", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1535": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1544": { + "qualifiedName": "__repr__", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1546": { + "qualifiedName": "session_count", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1547": { + "qualifiedName": "usable_session_count", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1548": { + "qualifiedName": "retired_session_count", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1549": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1550": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1552": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1557": { + "qualifiedName": "get_state", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1560": { + "qualifiedName": "add_session", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1563": { + "qualifiedName": "get_session", + 
"sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1565": { + "qualifiedName": "get_session_by_id", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1568": { + "qualifiedName": "reset_store", + "sourceFileName": "/crawlee/sessions/_session_pool.py" + }, + "1574": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1575": { + "qualifiedName": "Session", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1576": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1589": { + "qualifiedName": "from_model", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1592": { + "qualifiedName": "__repr__", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1594": { + "qualifiedName": "__eq__", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1597": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1598": { + "qualifiedName": "user_data", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1599": { + "qualifiedName": "cookies", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1600": { + "qualifiedName": "error_score", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1601": { + "qualifiedName": "usage_count", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1602": { + "qualifiedName": "expires_at", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1603": { + "qualifiedName": "is_blocked", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1604": { + "qualifiedName": "is_expired", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1605": { + "qualifiedName": "is_max_usage_count_reached", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1606": { + "qualifiedName": "is_usable", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1607": { + "qualifiedName": "get_state", + "sourceFileName": "/crawlee/sessions/_session.py" 
+ }, + "1610": { + "qualifiedName": "mark_good", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1612": { + "qualifiedName": "mark_bad", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1614": { + "qualifiedName": "retire", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1616": { + "qualifiedName": "is_blocked_status_code", + "sourceFileName": "/crawlee/sessions/_session.py" + }, + "1624": { + "qualifiedName": "SessionModel", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1625": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1626": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1627": { + "qualifiedName": "max_age", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1628": { + "qualifiedName": "user_data", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1629": { + "qualifiedName": "max_error_score", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1630": { + "qualifiedName": "error_score_decrement", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1631": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1632": { + "qualifiedName": "usage_count", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1633": { + "qualifiedName": "max_usage_count", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1634": { + "qualifiedName": "error_score", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1635": { + "qualifiedName": "cookies", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1636": { + "qualifiedName": "blocked_status_codes", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1637": { + "qualifiedName": "SessionPoolModel", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1638": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1639": { + "qualifiedName": 
"max_pool_size", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1640": { + "qualifiedName": "sessions", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1641": { + "qualifiedName": "session_count", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1642": { + "qualifiedName": "usable_session_count", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1643": { + "qualifiedName": "retired_session_count", + "sourceFileName": "/crawlee/sessions/_models.py" + }, + "1644": { + "qualifiedName": "CookieParam", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1645": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1646": { + "qualifiedName": "value", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1647": { + "qualifiedName": "domain", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1648": { + "qualifiedName": "path", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1649": { + "qualifiedName": "secure", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1650": { + "qualifiedName": "http_only", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1651": { + "qualifiedName": "expires", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1652": { + "qualifiedName": "same_site", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1653": { + "qualifiedName": "PlaywrightCookieParam", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1654": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1655": { + "qualifiedName": "value", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1656": { + "qualifiedName": "domain", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1657": { + "qualifiedName": "path", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1658": { + "qualifiedName": "secure", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1659": { + 
"qualifiedName": "httpOnly", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1660": { + "qualifiedName": "expires", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1661": { + "qualifiedName": "sameSite", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1662": { + "qualifiedName": "partitionKey", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1663": { + "qualifiedName": "SessionCookies", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1664": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1667": { + "qualifiedName": "jar", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1668": { + "qualifiedName": "set", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1679": { + "qualifiedName": "get_cookies_as_dicts", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1681": { + "qualifiedName": "store_cookie", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1684": { + "qualifiedName": "store_cookies", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1687": { + "qualifiedName": "set_cookies", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1690": { + "qualifiedName": "get_cookies_as_playwright_format", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1692": { + "qualifiedName": "set_cookies_from_playwright_format", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1695": { + "qualifiedName": "__deepcopy__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1698": { + "qualifiedName": "__len__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1700": { + "qualifiedName": "__setitem__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1704": { + "qualifiedName": "__getitem__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1707": { + "qualifiedName": "__iter__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1709": { + "qualifiedName": 
"__repr__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1711": { + "qualifiedName": "__bool__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1713": { + "qualifiedName": "__eq__", + "sourceFileName": "/crawlee/sessions/_cookies.py" + }, + "1716": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1717": { + "qualifiedName": "RequestManagerTandem", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1718": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1722": { + "qualifiedName": "get_total_count", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1724": { + "qualifiedName": "is_empty", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1726": { + "qualifiedName": "is_finished", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1728": { + "qualifiedName": "add_request", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1732": { + "qualifiedName": "add_requests_batched", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1739": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1741": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1745": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1748": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1750": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/request_loaders/_request_manager_tandem.py" + }, + "1752": { + "qualifiedName": "RequestManager", + "sourceFileName": 
"/crawlee/request_loaders/_request_manager.py" + }, + "1753": { + "qualifiedName": "drop", + "sourceFileName": "/crawlee/request_loaders/_request_manager.py" + }, + "1755": { + "qualifiedName": "add_request", + "sourceFileName": "/crawlee/request_loaders/_request_manager.py" + }, + "1759": { + "qualifiedName": "add_requests_batched", + "sourceFileName": "/crawlee/request_loaders/_request_manager.py" + }, + "1766": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/crawlee/request_loaders/_request_manager.py" + }, + "1770": { + "qualifiedName": "RequestLoader", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1771": { + "qualifiedName": "get_total_count", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1773": { + "qualifiedName": "is_empty", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1775": { + "qualifiedName": "is_finished", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1777": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1779": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1782": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1784": { + "qualifiedName": "to_tandem", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "1787": { + "qualifiedName": "RequestList", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1788": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1792": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1793": { + "qualifiedName": "get_total_count", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1795": { + "qualifiedName": "is_empty", + 
"sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1797": { + "qualifiedName": "is_finished", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1799": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1801": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1804": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/crawlee/request_loaders/_request_list.py" + }, + "1806": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1807": { + "qualifiedName": "_HttpxResponse", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1808": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1811": { + "qualifiedName": "http_version", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1812": { + "qualifiedName": "status_code", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1813": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1814": { + "qualifiedName": "read", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1816": { + "qualifiedName": "read_stream", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1818": { + "qualifiedName": "_HttpxTransport", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1819": { + "qualifiedName": "handle_async_request", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1822": { + "qualifiedName": "HttpxHttpClient", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1823": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1831": { + "qualifiedName": "crawl", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1837": { + "qualifiedName": "send_request", + "sourceFileName": 
"/crawlee/http_clients/_httpx.py" + }, + "1845": { + "qualifiedName": "stream", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1854": { + "qualifiedName": "cleanup", + "sourceFileName": "/crawlee/http_clients/_httpx.py" + }, + "1856": { + "qualifiedName": "_EmptyCookies", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1857": { + "qualifiedName": "get_cookies_for_curl", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1860": { + "qualifiedName": "update_cookies_from_curl", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1863": { + "qualifiedName": "_AsyncSession", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1864": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1868": { + "qualifiedName": "_CurlImpersonateResponse", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1869": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1872": { + "qualifiedName": "http_version", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1873": { + "qualifiedName": "status_code", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1874": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1875": { + "qualifiedName": "read", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1877": { + "qualifiedName": "read_stream", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1879": { + "qualifiedName": "CurlImpersonateHttpClient", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1880": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1884": { + "qualifiedName": "crawl", + "sourceFileName": 
"/crawlee/http_clients/_curl_impersonate.py" + }, + "1890": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1898": { + "qualifiedName": "stream", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1907": { + "qualifiedName": "cleanup", + "sourceFileName": "/crawlee/http_clients/_curl_impersonate.py" + }, + "1909": { + "qualifiedName": "HttpResponse", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1910": { + "qualifiedName": "http_version", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1911": { + "qualifiedName": "status_code", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1912": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1913": { + "qualifiedName": "read", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1915": { + "qualifiedName": "read_stream", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1917": { + "qualifiedName": "HttpCrawlingResult", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1918": { + "qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1919": { + "qualifiedName": "HttpClient", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1920": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1923": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1924": { + "qualifiedName": "crawl", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1930": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1938": { + "qualifiedName": "stream", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1947": { + "qualifiedName": "cleanup", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1949": { + "qualifiedName": "__aenter__", + "sourceFileName": 
"/crawlee/http_clients/_base.py" + }, + "1951": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "1956": { + "qualifiedName": "SupportedOperatingSystems", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1957": { + "qualifiedName": "SupportedDevices", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1958": { + "qualifiedName": "SupportedHttpVersion", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1959": { + "qualifiedName": "SupportedBrowserType", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1960": { + "qualifiedName": "ScreenOptions", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1961": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1962": { + "qualifiedName": "min_width", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1963": { + "qualifiedName": "max_width", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1964": { + "qualifiedName": "min_height", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1965": { + "qualifiedName": "max_height", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1966": { + "qualifiedName": "HeaderGeneratorOptions", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1967": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1968": { + "qualifiedName": "browsers", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1969": { + "qualifiedName": "operating_systems", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1970": { + "qualifiedName": "devices", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1971": { + "qualifiedName": "locales", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1972": { + "qualifiedName": "http_version", + 
"sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1973": { + "qualifiedName": "strict", + "sourceFileName": "/crawlee/fingerprint_suite/_types.py" + }, + "1974": { + "qualifiedName": "HeaderGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1975": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1977": { + "qualifiedName": "get_specific_headers", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1981": { + "qualifiedName": "get_common_headers", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1983": { + "qualifiedName": "get_random_user_agent_header", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1985": { + "qualifiedName": "get_user_agent_header", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1988": { + "qualifiedName": "get_sec_ch_ua_headers", + "sourceFileName": "/crawlee/fingerprint_suite/_header_generator.py" + }, + "1991": { + "qualifiedName": "FingerprintGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_fingerprint_generator.py" + }, + "1992": { + "qualifiedName": "generate", + "sourceFileName": "/crawlee/fingerprint_suite/_fingerprint_generator.py" + }, + "1994": { + "qualifiedName": "COMMON_ACCEPT_LANGUAGE", + "sourceFileName": "/crawlee/fingerprint_suite/_consts.py" + }, + "1995": { + "qualifiedName": "BROWSER_TYPE_HEADER_KEYWORD", + "sourceFileName": "/crawlee/fingerprint_suite/_consts.py" + }, + "1996": { + "qualifiedName": "PatchedHeaderGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "1997": { + "qualifiedName": "generate", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2007": { + "qualifiedName": "PatchedFingerprintGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2008": { + 
"qualifiedName": "__init__", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2015": { + "qualifiedName": "BrowserforgeFingerprintGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2016": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2022": { + "qualifiedName": "generate", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2024": { + "qualifiedName": "BrowserforgeHeaderGenerator", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2025": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2027": { + "qualifiedName": "generate", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2030": { + "qualifiedName": "get_available_header_network", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2032": { + "qualifiedName": "get_available_header_values", + "sourceFileName": "/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2036": { + "qualifiedName": "Event", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2037": { + "qualifiedName": "PERSIST_STATE", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2038": { + "qualifiedName": "SYSTEM_INFO", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2039": { + "qualifiedName": "MIGRATING", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2040": { + "qualifiedName": "ABORTING", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2041": { + "qualifiedName": "EXIT", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2042": { + "qualifiedName": "SESSION_RETIRED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2043": { + "qualifiedName": "BROWSER_LAUNCHED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2044": { + "qualifiedName": 
"BROWSER_RETIRED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2045": { + "qualifiedName": "BROWSER_CLOSED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2046": { + "qualifiedName": "PAGE_CREATED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2047": { + "qualifiedName": "PAGE_CLOSED", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2048": { + "qualifiedName": "EventPersistStateData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2049": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2050": { + "qualifiedName": "is_migrating", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2051": { + "qualifiedName": "EventSystemInfoData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2052": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2053": { + "qualifiedName": "cpu_info", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2054": { + "qualifiedName": "memory_info", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2055": { + "qualifiedName": "EventMigratingData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2056": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2057": { + "qualifiedName": "time_remaining", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2058": { + "qualifiedName": "EventAbortingData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2059": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2060": { + "qualifiedName": "EventExitData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2061": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2062": { + "qualifiedName": "EventData", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2063": { + "qualifiedName": "WrappedListener", + "sourceFileName": "/crawlee/events/_types.py" + }, + 
"2064": { + "qualifiedName": "TEvent", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2065": { + "qualifiedName": "EventListener", + "sourceFileName": "/crawlee/events/_types.py" + }, + "2066": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2067": { + "qualifiedName": "LocalEventManager", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2068": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2072": { + "qualifiedName": "from_config", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2075": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2077": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/events/_local_event_manager.py" + }, + "2082": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2083": { + "qualifiedName": "EventManagerOptions", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2084": { + "qualifiedName": "persist_state_interval", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2085": { + "qualifiedName": "close_timeout", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2086": { + "qualifiedName": "EventManager", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2087": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2091": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2092": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2094": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2099": { + "qualifiedName": "on", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2103": { + "qualifiedName": "off", + "sourceFileName": 
"/crawlee/events/_event_manager.py" + }, + "2107": { + "qualifiedName": "emit", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2111": { + "qualifiedName": "wait_for_all_listeners_to_complete", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "2150": { + "qualifiedName": "BlockedInfo", + "sourceFileName": "/crawlee/crawlers/_types.py" + }, + "2151": { + "qualifiedName": "reason", + "sourceFileName": "/crawlee/crawlers/_types.py" + }, + "2152": { + "qualifiedName": "__bool__", + "sourceFileName": "/crawlee/crawlers/_types.py" + }, + "2154": { + "qualifiedName": "infinite_scroll", + "sourceFileName": "/crawlee/crawlers/_playwright/_utils.py" + }, + "2157": { + "qualifiedName": "block_requests", + "sourceFileName": "/crawlee/crawlers/_playwright/_utils.py" + }, + "2162": { + "qualifiedName": "BlockRequestsFunction", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2163": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2167": { + "qualifiedName": "PlaywrightHttpResponse", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2168": { + "qualifiedName": "http_version", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2169": { + "qualifiedName": "status_code", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2170": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2171": { + "qualifiedName": "read", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2173": { + "qualifiedName": "read_stream", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2175": { + "qualifiedName": "from_playwright_response", + "sourceFileName": "/crawlee/crawlers/_playwright/_types.py" + }, + "2179": { + "qualifiedName": "PlaywrightPreNavCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, 
+ "2180": { + "qualifiedName": "page", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "2181": { + "qualifiedName": "block_requests", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "2182": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "2184": { + "qualifiedName": "browser_page_context", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2187": { + "qualifiedName": "PlaywrightHttpClient", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2188": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2190": { + "qualifiedName": "crawl", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2196": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2204": { + "qualifiedName": "stream", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2213": { + "qualifiedName": "cleanup", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "2215": { + "qualifiedName": "PlaywrightCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "2216": { + "qualifiedName": "response", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "2217": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "2218": { + "qualifiedName": "extract_links", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "2219": { + "qualifiedName": "infinite_scroll", + "sourceFileName": 
"/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "2220": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2221": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2222": { + "qualifiedName": "PlaywrightCrawler", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2223": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2234": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2237": { + "qualifiedName": "_PlaywrightCrawlerAdditionalOptions", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2238": { + "qualifiedName": "browser_pool", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2239": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2240": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2241": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2242": { + "qualifiedName": "headless", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2243": { + "qualifiedName": "PlaywrightCrawlerOptions", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "2244": { + "qualifiedName": "html_to_text", + "sourceFileName": "/crawlee/crawlers/_parsel/_utils.py" + }, + "2247": { + "qualifiedName": "ParselParser", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "2248": { + "qualifiedName": "parse", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" 
+ }, + "2251": { + "qualifiedName": "parse_text", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "2254": { + "qualifiedName": "select", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "2258": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "2262": { + "qualifiedName": "find_links", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "2266": { + "qualifiedName": "ParselCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "2267": { + "qualifiedName": "selector", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "2268": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "2271": { + "qualifiedName": "html_to_text", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "2273": { + "qualifiedName": "ParselCrawler", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawler.py" + }, + "2274": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_parsel/_parsel_crawler.py" + }, + "2277": { + "qualifiedName": "NoParser", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2278": { + "qualifiedName": "parse", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2281": { + "qualifiedName": "parse_text", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2284": { + "qualifiedName": "select", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2288": { + "qualifiedName": "is_blocked", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2291": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/crawlee/crawlers/_http/_http_parser.py" + }, + "2295": { + "qualifiedName": "find_links", + "sourceFileName": 
"/crawlee/crawlers/_http/_http_parser.py" + }, + "2299": { + "qualifiedName": "HttpCrawler", + "sourceFileName": "/crawlee/crawlers/_http/_http_crawler.py" + }, + "2300": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_http/_http_crawler.py" + }, + "2303": { + "qualifiedName": "html_to_text", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_utils.py" + }, + "2306": { + "qualifiedName": "BeautifulSoupParser", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2307": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2310": { + "qualifiedName": "parse", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2313": { + "qualifiedName": "parse_text", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2316": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2320": { + "qualifiedName": "select", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2324": { + "qualifiedName": "find_links", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2328": { + "qualifiedName": "BeautifulSoupParserType", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "2329": { + "qualifiedName": "BeautifulSoupCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "2330": { + "qualifiedName": "soup", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "2331": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "2334": { + "qualifiedName": "html_to_text", + "sourceFileName": 
"/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "2336": { + "qualifiedName": "BeautifulSoupCrawler", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py" + }, + "2337": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py" + }, + "2341": { + "qualifiedName": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "sourceFileName": "/crawlee/crawlers/_basic/_logging_utils.py" + }, + "2344": { + "qualifiedName": "get_one_line_error_summary_if_possible", + "sourceFileName": "/crawlee/crawlers/_basic/_logging_utils.py" + }, + "2347": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2348": { + "qualifiedName": "TMiddlewareCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2349": { + "qualifiedName": "ContextPipeline", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2350": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2354": { + "qualifiedName": "__call__", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2358": { + "qualifiedName": "compose", + "sourceFileName": "/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "2361": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2362": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2363": { + "qualifiedName": "TRequestIterator", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2364": { + "qualifiedName": "ErrorHandler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2365": { + "qualifiedName": "FailedRequestHandler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2366": { + 
"qualifiedName": "SkippedRequestCallback", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2367": { + "qualifiedName": "_BasicCrawlerOptions", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2368": { + "qualifiedName": "configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2369": { + "qualifiedName": "event_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2370": { + "qualifiedName": "storage_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2371": { + "qualifiedName": "request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2372": { + "qualifiedName": "session_pool", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2373": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2374": { + "qualifiedName": "http_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2375": { + "qualifiedName": "max_request_retries", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2376": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2377": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2378": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2379": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2380": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2381": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2382": { + "qualifiedName": "request_handler_timeout", + 
"sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2383": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2384": { + "qualifiedName": "configure_logging", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2385": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2386": { + "qualifiedName": "keep_alive", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2387": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2388": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2389": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2390": { + "qualifiedName": "_BasicCrawlerOptionsGeneric", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2391": { + "qualifiedName": "request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2392": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2393": { + "qualifiedName": "BasicCrawlerOptions", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2394": { + "qualifiedName": "BasicCrawler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2395": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2424": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2425": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2426": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + 
}, + "2429": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2430": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2433": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2435": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2439": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2443": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2446": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2449": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2452": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2456": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2463": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2468": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2473": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2479": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "2485": { + "qualifiedName": "create_default_comparator", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "2488": { + "qualifiedName": "full_result_comparator", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "2492": { + "qualifiedName": 
"push_data_only_comparator", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "2496": { + "qualifiedName": "UrlComponents", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2497": { + "qualifiedName": "RenderingType", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2498": { + "qualifiedName": "FeatureVector", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2499": { + "qualifiedName": "RenderingTypePrediction", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2500": { + "qualifiedName": "rendering_type", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2501": { + "qualifiedName": "detection_probability_recommendation", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2502": { + "qualifiedName": "RenderingTypePredictor", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2503": { + "qualifiedName": "predict", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2506": { + "qualifiedName": "store_result", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2510": { + "qualifiedName": "DefaultRenderingTypePredictor", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2511": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2514": { + "qualifiedName": "predict", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2517": { + "qualifiedName": "store_result", + "sourceFileName": 
"/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2521": { + "qualifiedName": "get_url_components", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2524": { + "qualifiedName": "calculate_url_similarity", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "2528": { + "qualifiedName": "TStaticParseResult", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2529": { + "qualifiedName": "TStaticSelectResult", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2530": { + "qualifiedName": "AdaptiveContextError", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2531": { + "qualifiedName": "AdaptivePlaywrightCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2532": { + "qualifiedName": "page", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2533": { + "qualifiedName": "infinite_scroll", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2534": { + "qualifiedName": "response", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2535": { + "qualifiedName": "wait_for_selector", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2539": { + "qualifiedName": "query_selector_one", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2543": { + "qualifiedName": "query_selector_all", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2547": { + "qualifiedName": 
"parse_with_static_parser", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2551": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2555": { + "qualifiedName": "from_playwright_crawling_context", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2559": { + "qualifiedName": "AdaptivePlaywrightPreNavCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2560": { + "qualifiedName": "block_requests", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2561": { + "qualifiedName": "page", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2562": { + "qualifiedName": "from_pre_navigation_context", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "2565": { + "qualifiedName": "AdaptivePlaywrightCrawlerStatisticState", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "2566": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "2567": { + "qualifiedName": "http_only_request_handler_runs", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "2568": { + "qualifiedName": "browser_request_handler_runs", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "2569": { + "qualifiedName": "rendering_type_mispredictions", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + 
"2570": { + "qualifiedName": "TStaticParseResult", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2571": { + "qualifiedName": "TStaticSelectResult", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2572": { + "qualifiedName": "TStaticCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2573": { + "qualifiedName": "_NonPersistentStatistics", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2574": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2576": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2578": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2583": { + "qualifiedName": "AdaptivePlaywrightCrawler", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2584": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2593": { + "qualifiedName": "with_beautifulsoup_static_parser", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2602": { + "qualifiedName": "with_parsel_static_parser", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2610": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2614": { + "qualifiedName": "track_http_only_request_handler_runs", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2616": { + 
"qualifiedName": "track_browser_request_handler_runs", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2618": { + "qualifiedName": "track_rendering_type_mispredictions", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2620": { + "qualifiedName": "SubCrawlerRun", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2621": { + "qualifiedName": "result", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2622": { + "qualifiedName": "exception", + "sourceFileName": "/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "2623": { + "qualifiedName": "TParseResult", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2624": { + "qualifiedName": "TSelectResult", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2625": { + "qualifiedName": "HttpCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2626": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2630": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2632": { + "qualifiedName": "ParsedHttpCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2633": { + "qualifiedName": "parsed_content", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2634": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2635": { + "qualifiedName": "extract_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2636": { + 
"qualifiedName": "from_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "2642": { + "qualifiedName": "AbstractHttpParser", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2643": { + "qualifiedName": "parse", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2646": { + "qualifiedName": "parse_text", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2649": { + "qualifiedName": "select", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2653": { + "qualifiedName": "is_blocked", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2656": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2660": { + "qualifiedName": "find_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "2664": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2665": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2666": { + "qualifiedName": "AbstractHttpCrawler", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2667": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2671": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2674": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "2677": { + "qualifiedName": "BrowserType", + "sourceFileName": "/crawlee/browsers/_types.py" + }, + "2678": 
{ + "qualifiedName": "CrawleePage", + "sourceFileName": "/crawlee/browsers/_types.py" + }, + "2679": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/browsers/_types.py" + }, + "2680": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_types.py" + }, + "2681": { + "qualifiedName": "page", + "sourceFileName": "/crawlee/browsers/_types.py" + }, + "2682": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2683": { + "qualifiedName": "PlaywrightBrowserPlugin", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2684": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2685": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2694": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2695": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2696": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2697": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2698": { + "qualifiedName": "max_open_pages_per_browser", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2699": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2701": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2706": { + "qualifiedName": "new_browser", + "sourceFileName": "/crawlee/browsers/_playwright_browser_plugin.py" + }, + "2708": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + 
"2709": { + "qualifiedName": "PlaywrightBrowserController", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2710": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2711": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2718": { + "qualifiedName": "pages", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2719": { + "qualifiedName": "total_opened_pages", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2720": { + "qualifiedName": "pages_count", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2721": { + "qualifiedName": "last_page_opened_at", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2722": { + "qualifiedName": "idle_time", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2723": { + "qualifiedName": "has_free_capacity", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2724": { + "qualifiedName": "is_browser_connected", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2725": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2726": { + "qualifiedName": "new_page", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2730": { + "qualifiedName": "close", + "sourceFileName": "/crawlee/browsers/_playwright_browser_controller.py" + }, + "2733": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2734": { + "qualifiedName": "PlaywrightPersistentBrowser", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2735": { + "qualifiedName": "__init__", + "sourceFileName": 
"/crawlee/browsers/_playwright_browser.py" + }, + "2740": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2741": { + "qualifiedName": "contexts", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2742": { + "qualifiedName": "is_connected", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2744": { + "qualifiedName": "new_context", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2747": { + "qualifiedName": "close", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2750": { + "qualifiedName": "version", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2751": { + "qualifiedName": "new_page", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2754": { + "qualifiedName": "new_browser_cdp_session", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2756": { + "qualifiedName": "start_tracing", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2759": { + "qualifiedName": "stop_tracing", + "sourceFileName": "/crawlee/browsers/_playwright_browser.py" + }, + "2762": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2763": { + "qualifiedName": "BrowserPool", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2764": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2772": { + "qualifiedName": "with_default_plugin", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2782": { + "qualifiedName": "plugins", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2783": { + "qualifiedName": "active_browsers", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2784": { + "qualifiedName": "inactive_browsers", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2785": { + "qualifiedName": "pages", + 
"sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2786": { + "qualifiedName": "total_pages_count", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2787": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2788": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2790": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2795": { + "qualifiedName": "new_page", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2800": { + "qualifiedName": "new_page_with_each_plugin", + "sourceFileName": "/crawlee/browsers/_browser_pool.py" + }, + "2802": { + "qualifiedName": "BrowserPlugin", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2803": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2804": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2805": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2806": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2807": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2808": { + "qualifiedName": "max_open_pages_per_browser", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2809": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2811": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2816": { + "qualifiedName": "new_browser", + "sourceFileName": "/crawlee/browsers/_browser_plugin.py" + }, + "2818": { + "qualifiedName": "BrowserController", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2819": { + 
"qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2820": { + "qualifiedName": "pages", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2821": { + "qualifiedName": "total_opened_pages", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2822": { + "qualifiedName": "pages_count", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2823": { + "qualifiedName": "last_page_opened_at", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2824": { + "qualifiedName": "idle_time", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2825": { + "qualifiedName": "has_free_capacity", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2826": { + "qualifiedName": "is_browser_connected", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2827": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2828": { + "qualifiedName": "new_page", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2832": { + "qualifiedName": "close", + "sourceFileName": "/crawlee/browsers/_browser_controller.py" + }, + "2835": { + "qualifiedName": "is_status_code_client_error", + "sourceFileName": "/crawlee/_utils/web.py" + }, + "2838": { + "qualifiedName": "is_status_code_server_error", + "sourceFileName": "/crawlee/_utils/web.py" + }, + "2841": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/_utils/wait.py" + }, + "2842": { + "qualifiedName": "wait_for", + "sourceFileName": "/crawlee/_utils/wait.py" + }, + "2849": { + "qualifiedName": "wait_for_all_tasks_for_finish", + "sourceFileName": "/crawlee/_utils/wait.py" + }, + "2854": { + "qualifiedName": "is_url_absolute", + "sourceFileName": "/crawlee/_utils/urls.py" + }, + "2857": { + "qualifiedName": "convert_to_absolute_url", + "sourceFileName": "/crawlee/_utils/urls.py" + }, + 
"2861": { + "qualifiedName": "to_absolute_url_iterator", + "sourceFileName": "/crawlee/_utils/urls.py" + }, + "2865": { + "qualifiedName": "validate_http_url", + "sourceFileName": "/crawlee/_utils/urls.py" + }, + "2868": { + "qualifiedName": "try_import", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2872": { + "qualifiedName": "install_import_hook", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2875": { + "qualifiedName": "FailedImport", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2876": { + "qualifiedName": "message", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2877": { + "qualifiedName": "ImportWrapper", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2878": { + "qualifiedName": "__getattribute__", + "sourceFileName": "/crawlee/_utils/try_import.py" + }, + "2881": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2882": { + "qualifiedName": "CpuInfo", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2883": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2884": { + "qualifiedName": "used_ratio", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2885": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2886": { + "qualifiedName": "MemoryUsageInfo", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2887": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2888": { + "qualifiedName": "current_size", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2889": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2890": { + "qualifiedName": "MemoryInfo", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2891": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2892": { + "qualifiedName": "total_size", + "sourceFileName": 
"/crawlee/_utils/system.py" + }, + "2893": { + "qualifiedName": "get_cpu_info", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2895": { + "qualifiedName": "get_memory_info", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "2897": { + "qualifiedName": "RobotsTxtFile", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2898": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2902": { + "qualifiedName": "from_content", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2906": { + "qualifiedName": "find", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2911": { + "qualifiedName": "load", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2916": { + "qualifiedName": "is_allowed", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2920": { + "qualifiedName": "get_sitemaps", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2922": { + "qualifiedName": "get_crawl_delay", + "sourceFileName": "/crawlee/_utils/robots.py" + }, + "2925": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_utils/requests.py" + }, + "2926": { + "qualifiedName": "unique_key_to_request_id", + "sourceFileName": "/crawlee/_utils/requests.py" + }, + "2930": { + "qualifiedName": "normalize_url", + "sourceFileName": "/crawlee/_utils/requests.py" + }, + "2934": { + "qualifiedName": "compute_unique_key", + "sourceFileName": "/crawlee/_utils/requests.py" + }, + "2943": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_utils/recurring_task.py" + }, + "2944": { + "qualifiedName": "RecurringTask", + "sourceFileName": "/crawlee/_utils/recurring_task.py" + }, + "2945": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_utils/recurring_task.py" + }, + "2949": { + "qualifiedName": "start", + "sourceFileName": "/crawlee/_utils/recurring_task.py" + }, + "2951": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/_utils/recurring_task.py" + }, + "2953": { + "qualifiedName": 
"TStateModel", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2954": { + "qualifiedName": "RecoverableState", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2955": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2963": { + "qualifiedName": "initialize", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2965": { + "qualifiedName": "teardown", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2967": { + "qualifiedName": "current_value", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2968": { + "qualifiedName": "reset", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2970": { + "qualifiedName": "persist_state", + "sourceFileName": "/crawlee/_utils/recoverable_state.py" + }, + "2973": { + "qualifiedName": "timedelta_ms", + "sourceFileName": "/crawlee/_utils/models.py" + }, + "2974": { + "qualifiedName": "timedelta_secs", + "sourceFileName": "/crawlee/_utils/models.py" + }, + "2975": { + "qualifiedName": "TimerResult", + "sourceFileName": "/crawlee/_utils/measure_time.py" + }, + "2976": { + "qualifiedName": "wall", + "sourceFileName": "/crawlee/_utils/measure_time.py" + }, + "2977": { + "qualifiedName": "cpu", + "sourceFileName": "/crawlee/_utils/measure_time.py" + }, + "2978": { + "qualifiedName": "measure_time", + "sourceFileName": "/crawlee/_utils/measure_time.py" + }, + "2980": { + "qualifiedName": "SKIP_TAGS", + "sourceFileName": "/crawlee/_utils/html_to_text.py" + }, + "2981": { + "qualifiedName": "BLOCK_TAGS", + "sourceFileName": "/crawlee/_utils/html_to_text.py" + }, + "2982": { + "qualifiedName": "Glob", + "sourceFileName": "/crawlee/_utils/globs.py" + }, + "2983": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_utils/globs.py" + }, + "2986": { + "qualifiedName": "ContentType", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2987": { + "qualifiedName": "JSON", + 
"sourceFileName": "/crawlee/_utils/file.py" + }, + "2988": { + "qualifiedName": "TEXT", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2989": { + "qualifiedName": "XML", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2990": { + "qualifiedName": "matches", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2991": { + "qualifiedName": "is_content_type", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2995": { + "qualifiedName": "force_remove", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "2998": { + "qualifiedName": "force_rename", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "3002": { + "qualifiedName": "determine_file_extension", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "3005": { + "qualifiedName": "is_file_or_bytes", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "3008": { + "qualifiedName": "json_dumps", + "sourceFileName": "/crawlee/_utils/file.py" + }, + "3011": { + "qualifiedName": "GroupName", + "sourceFileName": "/crawlee/_utils/docs.py" + }, + "3012": { + "qualifiedName": "docs_group", + "sourceFileName": "/crawlee/_utils/docs.py" + }, + "3015": { + "qualifiedName": "maybe_extract_enum_member_value", + "sourceFileName": "/crawlee/_utils/data_processing.py" + }, + "3018": { + "qualifiedName": "maybe_parse_body", + "sourceFileName": "/crawlee/_utils/data_processing.py" + }, + "3022": { + "qualifiedName": "raise_on_duplicate_storage", + "sourceFileName": "/crawlee/_utils/data_processing.py" + }, + "3027": { + "qualifiedName": "raise_on_non_existing_storage", + "sourceFileName": "/crawlee/_utils/data_processing.py" + }, + "3031": { + "qualifiedName": "compute_short_hash", + "sourceFileName": "/crawlee/_utils/crypto.py" + }, + "3035": { + "qualifiedName": "crypto_random_object_id", + "sourceFileName": "/crawlee/_utils/crypto.py" + }, + "3038": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/_utils/context.py" + }, + "3039": { + "qualifiedName": "ensure_context", + "sourceFileName": 
"/crawlee/_utils/context.py" + }, + "3042": { + "qualifiedName": "BORDER", + "sourceFileName": "/crawlee/_utils/console.py" + }, + "3043": { + "qualifiedName": "make_table", + "sourceFileName": "/crawlee/_utils/console.py" + }, + "3047": { + "qualifiedName": "ByteSize", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3048": { + "qualifiedName": "bytes", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3049": { + "qualifiedName": "__post_init__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3051": { + "qualifiedName": "validate", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3054": { + "qualifiedName": "from_kb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3057": { + "qualifiedName": "from_mb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3060": { + "qualifiedName": "from_gb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3063": { + "qualifiedName": "from_tb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3066": { + "qualifiedName": "to_kb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3068": { + "qualifiedName": "to_mb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3070": { + "qualifiedName": "to_gb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3072": { + "qualifiedName": "to_tb", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3074": { + "qualifiedName": "__str__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3076": { + "qualifiedName": "__eq__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3079": { + "qualifiedName": "__lt__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3082": { + "qualifiedName": "__le__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3085": { + "qualifiedName": "__gt__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3088": { + "qualifiedName": "__ge__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3091": { + 
"qualifiedName": "__add__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3094": { + "qualifiedName": "__sub__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3097": { + "qualifiedName": "__mul__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3100": { + "qualifiedName": "__truediv__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3103": { + "qualifiedName": "__rmul__", + "sourceFileName": "/crawlee/_utils/byte_size.py" + }, + "3106": { + "qualifiedName": "CLOUDFLARE_RETRY_CSS_SELECTORS", + "sourceFileName": "/crawlee/_utils/blocked.py" + }, + "3107": { + "qualifiedName": "RETRY_CSS_SELECTORS", + "sourceFileName": "/crawlee/_utils/blocked.py" + }, + "3108": { + "qualifiedName": "ROTATE_PROXY_ERRORS", + "sourceFileName": "/crawlee/_utils/blocked.py" + }, + "3109": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_autoscaling/system_status.py" + }, + "3110": { + "qualifiedName": "SystemStatus", + "sourceFileName": "/crawlee/_autoscaling/system_status.py" + }, + "3111": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_autoscaling/system_status.py" + }, + "3119": { + "qualifiedName": "get_current_system_info", + "sourceFileName": "/crawlee/_autoscaling/system_status.py" + }, + "3121": { + "qualifiedName": "get_historical_system_info", + "sourceFileName": "/crawlee/_autoscaling/system_status.py" + }, + "3123": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3124": { + "qualifiedName": "T", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3125": { + "qualifiedName": "Snapshotter", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3126": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3133": { + "qualifiedName": "from_config", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3136": { + "qualifiedName": "active", + "sourceFileName": 
"/crawlee/_autoscaling/snapshotter.py" + }, + "3137": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3139": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3144": { + "qualifiedName": "get_memory_sample", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3147": { + "qualifiedName": "get_event_loop_sample", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3150": { + "qualifiedName": "get_cpu_sample", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3153": { + "qualifiedName": "get_client_sample", + "sourceFileName": "/crawlee/_autoscaling/snapshotter.py" + }, + "3156": { + "qualifiedName": "logger", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3157": { + "qualifiedName": "AbortError", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3158": { + "qualifiedName": "_AutoscaledPoolRun", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3159": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3161": { + "qualifiedName": "AutoscaledPool", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3162": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3169": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3171": { + "qualifiedName": "abort", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3173": { + "qualifiedName": "pause", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3175": { + "qualifiedName": "resume", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3177": { + "qualifiedName": "desired_concurrency", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3178": { + "qualifiedName": 
"current_concurrency", + "sourceFileName": "/crawlee/_autoscaling/autoscaled_pool.py" + }, + "3179": { + "qualifiedName": "LoadRatioInfo", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3180": { + "qualifiedName": "limit_ratio", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3181": { + "qualifiedName": "actual_ratio", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3182": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3183": { + "qualifiedName": "SystemInfo", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3184": { + "qualifiedName": "cpu_info", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3185": { + "qualifiedName": "memory_info", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3186": { + "qualifiedName": "event_loop_info", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3187": { + "qualifiedName": "client_info", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3188": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3189": { + "qualifiedName": "is_system_idle", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3190": { + "qualifiedName": "__str__", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3192": { + "qualifiedName": "CpuSnapshot", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3193": { + "qualifiedName": "used_ratio", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3194": { + "qualifiedName": "max_used_ratio", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3195": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3196": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3197": { + "qualifiedName": "MemorySnapshot", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3198": 
{ + "qualifiedName": "current_size", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3199": { + "qualifiedName": "max_memory_size", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3200": { + "qualifiedName": "max_used_memory_ratio", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3201": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3202": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3203": { + "qualifiedName": "EventLoopSnapshot", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3204": { + "qualifiedName": "delay", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3205": { + "qualifiedName": "max_delay", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3206": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3207": { + "qualifiedName": "max_delay_exceeded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3208": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3209": { + "qualifiedName": "ClientSnapshot", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3210": { + "qualifiedName": "error_count", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3211": { + "qualifiedName": "new_error_count", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3212": { + "qualifiedName": "max_error_count", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3213": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3214": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3215": { + "qualifiedName": "Snapshot", + "sourceFileName": "/crawlee/_autoscaling/_types.py" + }, + "3216": { + "qualifiedName": "current_size", + "sourceFileName": 
"/crawlee/_utils/system.py" + }, + "3217": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/_utils/system.py" + }, + "3218": { + "qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3219": { + "qualifiedName": "replace_state_model", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3220": { + "qualifiedName": "with_default_state", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3221": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3222": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3223": { + "qualifiedName": "register_status_code", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3224": { + "qualifiedName": "record_request_processing_start", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3225": { + "qualifiedName": "record_request_processing_finish", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3226": { + "qualifiedName": "record_request_processing_failure", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3227": { + "qualifiedName": "calculate", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3228": { + "qualifiedName": "reset", + "sourceFileName": "/crawlee/statistics/_statistics.py" + }, + "3229": { + "qualifiedName": "stats_id", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3230": { + "qualifiedName": "requests_finished", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3231": { + "qualifiedName": "requests_failed", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3232": { + "qualifiedName": "requests_retries", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3233": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3234": { + "qualifiedName": "requests_finished_per_minute", + 
"sourceFileName": "/crawlee/statistics/_models.py" + }, + "3235": { + "qualifiedName": "request_min_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3236": { + "qualifiedName": "request_max_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3237": { + "qualifiedName": "request_total_failed_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3238": { + "qualifiedName": "request_total_finished_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3239": { + "qualifiedName": "crawler_started_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3240": { + "qualifiedName": "crawler_last_started_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3241": { + "qualifiedName": "crawler_finished_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3242": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3243": { + "qualifiedName": "errors", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3244": { + "qualifiedName": "retry_errors", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3245": { + "qualifiedName": "requests_with_status_code", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3246": { + "qualifiedName": "stats_persisted_at", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3247": { + "qualifiedName": "request_retry_histogram", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3248": { + "qualifiedName": "request_total_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3249": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3250": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3251": { + "qualifiedName": "requests_total", + "sourceFileName": "/crawlee/statistics/_models.py" + }, + "3252": { 
+ "qualifiedName": "request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3253": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3254": { + "qualifiedName": "configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3255": { + "qualifiedName": "event_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3256": { + "qualifiedName": "storage_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3257": { + "qualifiedName": "request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3258": { + "qualifiedName": "session_pool", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3259": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3260": { + "qualifiedName": "http_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3261": { + "qualifiedName": "max_request_retries", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3262": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3263": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3264": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3265": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3266": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3267": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3268": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3269": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3270": { + "qualifiedName": "configure_logging", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3271": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3272": { + "qualifiedName": "keep_alive", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3273": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3274": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3275": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3276": { + "qualifiedName": "is_blocked", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "3277": { + "qualifiedName": "is_blocked", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "3278": { + "qualifiedName": "request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3279": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3280": { + "qualifiedName": "configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3281": { + "qualifiedName": "event_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3282": { + "qualifiedName": "storage_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3283": { + "qualifiedName": "request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3284": { + "qualifiedName": "session_pool", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3285": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3286": { + "qualifiedName": "http_client", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3287": { + "qualifiedName": "max_request_retries", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3288": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3289": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3290": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3291": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3292": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3293": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3294": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3295": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3296": { + "qualifiedName": "configure_logging", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3297": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3298": { + "qualifiedName": "keep_alive", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3299": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3300": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3301": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3302": { + "qualifiedName": "browser_pool", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "3303": { + "qualifiedName": "browser_type", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "3304": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "3305": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "3306": { + "qualifiedName": "headless", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "3307": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3308": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3309": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3310": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3311": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3312": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3313": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3314": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3315": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3316": { + "qualifiedName": "on_skipped_request", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3317": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3318": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3319": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3320": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3321": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3322": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3323": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3324": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3325": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3326": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3327": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3328": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3329": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3330": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3331": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3332": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3333": { + "qualifiedName": "run", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3334": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3335": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3336": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3337": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3338": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3339": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3340": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3341": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3342": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3343": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3344": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3345": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3346": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3347": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3348": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3349": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3350": { + "qualifiedName": "add_requests", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3351": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3352": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3353": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3354": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3355": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3356": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3357": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3358": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3359": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3360": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3361": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3362": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3363": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3364": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3365": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3366": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3367": { 
+ "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3368": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3369": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3370": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3371": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3372": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3373": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3374": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3375": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3376": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3377": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3378": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3379": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3380": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3381": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3382": { + "qualifiedName": "error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3383": { + "qualifiedName": "failed_request_handler", + "sourceFileName": 
"/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3384": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3385": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3386": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3387": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3388": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3389": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3390": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3391": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3392": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "3393": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3394": { + "qualifiedName": "router", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3395": { + "qualifiedName": "statistics", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3396": { + "qualifiedName": "stop", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3397": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3398": { + "qualifiedName": "get_dataset", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3399": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3400": { + "qualifiedName": 
"error_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3401": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3402": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3403": { + "qualifiedName": "run", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3404": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3405": { + "qualifiedName": "get_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3406": { + "qualifiedName": "export_data", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3407": { + "qualifiedName": "export_data_csv", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3408": { + "qualifiedName": "export_data_json", + "sourceFileName": "/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "3409": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3410": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3411": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3412": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3413": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3414": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3415": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3416": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3417": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3418": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3419": { + "qualifiedName": "request", + "sourceFileName": 
"/crawlee/_types.py" + }, + "3420": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3421": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3422": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3423": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3424": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3425": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3426": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3427": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3428": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/_types.py" + }, + "3429": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3430": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3431": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3432": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3433": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3434": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3435": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3436": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3437": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3438": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3439": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3440": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + 
"3441": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3442": { + "qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3443": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3444": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3445": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3446": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3447": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3448": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3449": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3450": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3451": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3452": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3453": { + "qualifiedName": "parsed_content", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3454": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3455": { + "qualifiedName": "extract_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3456": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3457": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3458": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3459": { + 
"qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3460": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3461": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3462": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3463": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3464": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3465": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3466": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3467": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3468": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3469": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3470": { + "qualifiedName": "parsed_content", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3471": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3472": { + "qualifiedName": "extract_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3473": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3474": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3475": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3476": { + "qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3477": { + "qualifiedName": "request", + "sourceFileName": 
"/crawlee/_types.py" + }, + "3478": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3479": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3480": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3481": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3482": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3483": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3484": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3485": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3486": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3487": { + "qualifiedName": "parsed_content", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3488": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3489": { + "qualifiedName": "extract_links", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3490": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3491": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3492": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "3493": { + "qualifiedName": "http_response", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3494": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3495": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3496": { + "qualifiedName": 
"proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3497": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3498": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3499": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3500": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3501": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3502": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3503": { + "qualifiedName": "__hash__", + "sourceFileName": "/crawlee/_types.py" + }, + "3504": { + "qualifiedName": "page", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "3505": { + "qualifiedName": "block_requests", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "3506": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "3507": { + "qualifiedName": "request", + "sourceFileName": "/crawlee/_types.py" + }, + "3508": { + "qualifiedName": "session", + "sourceFileName": "/crawlee/_types.py" + }, + "3509": { + "qualifiedName": "proxy_info", + "sourceFileName": "/crawlee/_types.py" + }, + "3510": { + "qualifiedName": "send_request", + "sourceFileName": "/crawlee/_types.py" + }, + "3511": { + "qualifiedName": "add_requests", + "sourceFileName": "/crawlee/_types.py" + }, + "3512": { + "qualifiedName": "push_data", + "sourceFileName": "/crawlee/_types.py" + }, + "3513": { + "qualifiedName": "use_state", + "sourceFileName": "/crawlee/_types.py" + }, + "3514": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/crawlee/_types.py" + }, + "3515": { + "qualifiedName": "log", + "sourceFileName": "/crawlee/_types.py" + }, + "3516": { + "qualifiedName": "__hash__", + 
"sourceFileName": "/crawlee/_types.py" + }, + "3517": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "3518": { + "qualifiedName": "on", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "3519": { + "qualifiedName": "off", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "3520": { + "qualifiedName": "emit", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "3521": { + "qualifiedName": "wait_for_all_listeners_to_complete", + "sourceFileName": "/crawlee/events/_event_manager.py" + }, + "3522": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3523": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3524": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3525": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3526": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3527": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3528": { + "qualifiedName": "active", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3529": { + "qualifiedName": "__aenter__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3530": { + "qualifiedName": "__aexit__", + "sourceFileName": "/crawlee/http_clients/_base.py" + }, + "3531": { + "qualifiedName": "get_total_count", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3532": { + "qualifiedName": "is_empty", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3533": { + "qualifiedName": "is_finished", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3534": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3535": { + "qualifiedName": 
"mark_request_as_handled", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3536": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3537": { + "qualifiedName": "to_tandem", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3538": { + "qualifiedName": "to_tandem", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3539": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/crawlee/storage_clients/_base/_storage_client.py" + }, + "3540": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3541": { + "qualifiedName": "count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3542": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3543": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3544": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3545": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3546": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3547": { + "qualifiedName": "count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3548": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3549": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3550": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3551": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3552": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3553": { + "qualifiedName": "count", + "sourceFileName": 
"/crawlee/storage_clients/models.py" + }, + "3554": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3555": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3556": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3557": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3558": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3559": { + "qualifiedName": "count", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3560": { + "qualifiedName": "offset", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3561": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3562": { + "qualifiedName": "total", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3563": { + "qualifiedName": "desc", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3564": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3565": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3566": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3567": { + "qualifiedName": "queue_modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3568": { + "qualifiedName": "items", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3569": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3570": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3571": { + "qualifiedName": "accessed_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3572": { + "qualifiedName": "created_at", + "sourceFileName": 
"/crawlee/storage_clients/models.py" + }, + "3573": { + "qualifiedName": "modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3574": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3575": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3576": { + "qualifiedName": "accessed_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3577": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3578": { + "qualifiedName": "modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3579": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3580": { + "qualifiedName": "name", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3581": { + "qualifiedName": "accessed_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3582": { + "qualifiedName": "created_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3583": { + "qualifiedName": "modified_at", + "sourceFileName": "/crawlee/storage_clients/models.py" + }, + "3584": { + "qualifiedName": "to_tandem", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3585": { + "qualifiedName": "to_tandem", + "sourceFileName": "/crawlee/request_loaders/_request_loader.py" + }, + "3586": { + "qualifiedName": "model_config", + "sourceFileName": "/crawlee/_request.py" + }, + "3587": { + "qualifiedName": "url", + "sourceFileName": "/crawlee/_request.py" + }, + "3588": { + "qualifiedName": "method", + "sourceFileName": "/crawlee/_request.py" + }, + "3589": { + "qualifiedName": "headers", + "sourceFileName": "/crawlee/_request.py" + }, + "3590": { + "qualifiedName": "payload", + "sourceFileName": "/crawlee/_request.py" + }, + "3591": { + "qualifiedName": "user_data", + "sourceFileName": "/crawlee/_request.py" + }, + "3592": { + "qualifiedName": 
"retry_count", + "sourceFileName": "/crawlee/_request.py" + }, + "3593": { + "qualifiedName": "no_retry", + "sourceFileName": "/crawlee/_request.py" + }, + "3594": { + "qualifiedName": "loaded_url", + "sourceFileName": "/crawlee/_request.py" + }, + "3595": { + "qualifiedName": "handled_at", + "sourceFileName": "/crawlee/_request.py" + }, + "3596": { + "qualifiedName": "unique_key", + "sourceFileName": "/crawlee/_request.py" + }, + "3597": { + "qualifiedName": "id", + "sourceFileName": "/crawlee/_request.py" + }, + "3598": { + "qualifiedName": "from_url", + "sourceFileName": "/crawlee/_request.py" + }, + "3599": { + "qualifiedName": "get_query_param_from_url", + "sourceFileName": "/crawlee/_request.py" + }, + "3600": { + "qualifiedName": "label", + "sourceFileName": "/crawlee/_request.py" + }, + "3601": { + "qualifiedName": "session_id", + "sourceFileName": "/crawlee/_request.py" + }, + "3602": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/crawlee/_request.py" + }, + "3603": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/crawlee/_request.py" + }, + "3604": { + "qualifiedName": "state", + "sourceFileName": "/crawlee/_request.py" + }, + "3605": { + "qualifiedName": "max_retries", + "sourceFileName": "/crawlee/_request.py" + }, + "3606": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/crawlee/_request.py" + }, + "3607": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/crawlee/_request.py" + }, + "3608": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/crawlee/_request.py" + }, + "3609": { + "qualifiedName": "forefront", + "sourceFileName": "/crawlee/_request.py" + }, + "3610": { + "qualifiedName": "limit", + "sourceFileName": "/crawlee/_types.py" + }, + "3611": { + "qualifiedName": "base_url", + "sourceFileName": "/crawlee/_types.py" + }, + "3612": { + "qualifiedName": "strategy", + "sourceFileName": "/crawlee/_types.py" + }, + "3613": { + "qualifiedName": "include", + "sourceFileName": 
"/crawlee/_types.py" + }, + "3614": { + "qualifiedName": "exclude", + "sourceFileName": "/crawlee/_types.py" + }, + "3615": { + "qualifiedName": "__init__", + "sourceFileName": "/crawlee/errors.py" + } + } +} \ No newline at end of file diff --git a/website/versioned_docs/version-0.6/changelog.md b/website/versioned_docs/version-0.6/changelog.md new file mode 100644 index 0000000000..cfc5359b54 --- /dev/null +++ b/website/versioned_docs/version-0.6/changelog.md @@ -0,0 +1,581 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30) + +### ๐Ÿš€ Features + +- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256) +- Ignore unknown parameters passed in cookies (#1336) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) +- Fix memory estimation not working on MacOS (#1330) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) +- Fix retry count to not count the original request (#1328) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) + + +## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23) + +### ๐Ÿš€ Features + +- Add `stream` method for `HttpClient` 
([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1207](https://github.com/apify/crawlee-python/issues/1207) +- Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206) +- Do not raise an error to check 'same-domain' if there is no hostname in the url ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02) + +### ๐Ÿ› Bug Fixes + +- Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185) +- Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) ([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus) +- Fix match check for specified enqueue strategy for requests with redirect 
([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198) +- Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) ([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204) +- Add support non-GET requests for `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201) +- Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212) + + +## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02) + +### ๐Ÿš€ Features + +- Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928) +- Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) 
([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158) +- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160) + +### ๐Ÿ› Bug Fixes + +- Fix handle error without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179) +- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25) + +### ๐Ÿš€ Features + +- Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456) +- Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) ([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Update `UnprocessedRequest` to match actual data 
([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150) +- Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) ([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus) +- Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17) + +### ๐Ÿš€ Features + +- Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151) + +### ๐Ÿ› Bug Fixes + +- Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138) +- Disable logger propagation to prevent duplicate logs ([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek) + + +## 
[0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03) + +### ๐Ÿš€ Features + +- Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700) +- Add Session binding capability via `session_id` in `Request` ([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076) +- Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel) + +### ๐Ÿ› Bug Fixes + +- Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus) +- Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029) +- Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) ([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva) +- **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) 
([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by [@baldasseva](https://github.com/baldasseva) + + +## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13) + +### ๐Ÿ› Bug Fixes + +- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12) + +### ๐Ÿ› Bug Fixes + +- Add a check thread before set `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus) +- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07) + +### ๐Ÿš€ Features + +- Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053) +- Use fingerprint generator in `PlaywrightCrawler` by default ([#1060](https://github.com/apify/crawlee-python/pull/1060)) ([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054) + +### ๐Ÿ› Bug Fixes + +- Update project templates for 
Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049)) ([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954) +- Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05) + +### ๐Ÿš€ Features + +- Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03) + +### ๐Ÿ› Bug Fixes + +- Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03) + +### ๐Ÿš€ Features + +- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549) +- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel) +- Implement `_snapshot_client` 
for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60) +- Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249) +- [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953) +- Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937) + +### ๐Ÿ› Bug Fixes + +- Fix playwright template and dockerfile ([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969) +- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975) +- Fix default migration storage 
([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991) +- Fix logger name for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021) +- Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016) +- Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013) +- Fix flaky test_common_headers_and_user_agent ([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027) + +### Refactor + +- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) 
([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) ([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963) +- [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933) +- [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Update status code handling ([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998) +- [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703),
[#1010](https://github.com/apify/crawlee-python/issues/1010) + + +## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05) + +### ๐Ÿš€ Features + +- Add support `use_incognito_pages` for `browser_launch_options` in `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus) +- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951) +- Fix convert relative link to absolute in `enqueue_links` for response with redirect ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955) +- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31) + +### ๐Ÿš€ Features + +- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) 
([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891) +- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848) +- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank) +- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894) +- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda) +- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel) + +### ๐Ÿ› Bug Fixes + +- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by 
[@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907) +- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895) +- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17) + +### ๐Ÿ› Bug Fixes + +- Avoid `use_state` race conditions. Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856) +- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887) +- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670) + + +## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07) + +### ๐Ÿ› Bug Fixes + +- Make result of RequestList.is_empty independent of 
fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02) + +### ๐Ÿš€ Features + +- Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687) +- Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191) +- Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635) +- Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684) +- Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes 
[#561](https://github.com/apify/crawlee-python/issues/561) +- Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) ([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651) +- Add `html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659) +- [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿ› Bug Fixes + +- Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804) +- [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699) +- Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) 
([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798) +- Fix `page_options` for `PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by [@Mantisus](https://github.com/Mantisus), closes [#755](https://github.com/apify/crawlee-python/issues/755) +- Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815) +- Respect user configuration for work with status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756) +- `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus) +- Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806) +- Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by 
[@janbuchar](https://github.com/janbuchar) + +### Refactor + +- [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) ([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350) +- [**breaking**] Remove `json_` and `order_no` from `Request` ([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94) +- [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel) +- [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764) + + +## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06) + +### ๐Ÿš€ Features + +- Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), 
closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), [#511](https://github.com/apify/crawlee-python/issues/511) + +### ๐Ÿ› Bug Fixes + +- Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) ([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek) +- Fix incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759) +- Remove pydantic constraint <2.10.0 and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29) + +### ๐Ÿš€ Features + +- Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719) +- Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704) + +### ๐Ÿ› Bug Fixes + +- Fix init of context managers and context handling in `BasicCrawler` 
([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek) + + +## [0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21) + +### ๐Ÿ› Bug Fixes + +- Pydantic 2.10.0 issues ([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20) + +### ๐Ÿ› Bug Fixes + +- Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus) +- Fix serialization payload in Request. Fix Docs for Post Request ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668) +- Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek) +- Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11) + +### ๐Ÿš€ Features + +- Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) 
([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460) +- Add BeautifulSoupParser type alias ([#674](https://github.com/apify/crawlee-python/pull/674)) ([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel) + +### ๐Ÿ› Bug Fixes + +- Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) ([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar) +- Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663) +- Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar) +- Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01) + +### ๐Ÿš€ Features + +- [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes 
[#548](https://github.com/apify/crawlee-python/issues/548) +- Add `pre_navigation_hooks` to `PlaywrightCrawler` ([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427) +- Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547) +- Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526) + +### ๐Ÿ› Bug Fixes + +- Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560) +- Default ProxyInfo port if httpx.URL port is None ([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618) + +### Chore + +- 
[**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) ([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615) + + +## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23) + +### ๐Ÿš€ Features + +- Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar) +- Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514) + +### ๐Ÿ› Bug Fixes + +- Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563) + + +## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02) + +### ๐Ÿš€ Features + +- Mask Playwright's "headless" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by [@vdusek](https://github.com/vdusek), closes [#401](https://github.com/apify/crawlee-python/issues/401) +- Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek) + +### 
๐Ÿ› Bug Fixes + +- Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) ([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546) +- Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551) +- Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25) + +### ๐Ÿ› Bug Fixes + +- Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524) +- Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek) + + +## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) (2024-09-19) + +### ๐Ÿš€ Features + +- Add HTTP/2 support for HTTPX client ([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512) +- Expose 
extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) ([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek) +- Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402) + +### ๐Ÿ› Bug Fixes + +- Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532) + + +## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10) + +### ๐Ÿš€ Features + +- Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿ› Bug Fixes + +- Http clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496) +- Correctly handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) ([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05) + +### ๐Ÿ› Bug Fixes + 
+- Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) ([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek) + + +## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05) + +### ๐Ÿ› Bug Fixes + +- Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02) + +### ๐Ÿ› Bug Fixes + +- Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442) +- Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar) +- JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488) + + +## [0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30) + +### ๐Ÿš€ Features + +- Curl http client selects chrome impersonation by default ([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek) + + 
+## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27) + +### ๐Ÿš€ Features + +- Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335) +- Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305) + +### ๐Ÿ› Bug Fixes + +- Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306) +- Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar) +- Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) ([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar) +- Relative URLS supports & If not a URL, pass #417 ([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417) +- Typo in ProlongRequestLockResponse 
([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar) +- Add missing __all__ to top-level __init__.py file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar) + +### Refactor + +- [**breaking**] RequestQueue and service management rehaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423) +- [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek) + + +## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05) + +### ๐Ÿ› Bug Fixes + +- Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) ([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek) + + +## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05) + +### ๐Ÿš€ Features + +- Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), closes 
[#292](https://github.com/apify/crawlee-python/issues/292) +- **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30) + +### ๐Ÿš€ Features + +- Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300) + +### ๐Ÿ› Bug Fixes + +- Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540) +- Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296) +- Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) ([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337) +- Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) ([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334) +- Avoid Github rate limiting in project 
bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar) +- Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar) +- Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152) + + +## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19) + +### ๐Ÿš€ Features + +- Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303) +- Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) by [@vdusek](https://github.com/vdusek) +- Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239) + +### ๐Ÿ› Bug Fixes + +- Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by 
[@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283) +- Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302) +- Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar) +- Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301) + + +## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08) + +### ๐Ÿš€ Features + +- Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215) + +### ๐Ÿ› Bug Fixes + +- CLI UX improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267) +- Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) ([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes 
[#268](https://github.com/apify/crawlee-python/issues/268) + + +## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27) + +### ๐Ÿ› Bug Fixes + +- Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek) +- Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230) +- Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus) +- Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236) + + +## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25) + +### ๐Ÿš€ Features + +- Maintain a global configuration instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar) +- Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by [@vdusek](https://github.com/vdusek) +- Add support decompress *br* response content 
([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus) +- BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211) +- Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214) + +### ๐Ÿ› Bug Fixes + +- Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204) +- Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes [#212](https://github.com/apify/crawlee-python/issues/212) + + +## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21) + +### ๐Ÿš€ Features + +- Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes [#131](https://github.com/apify/crawlee-python/issues/131) +- Add emit persist state event to event 
manager ([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek) +- Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek) +- Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172) +- Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84) +- Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek) + +### ๐Ÿ› Bug Fixes + +- Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159) +- Add explicit err msgs for missing pckg extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) ([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by 
[@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155) +- Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar) +- **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot]) +- Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar) +- Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar) +- Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194) + + +## [0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30) + +### ๐Ÿš€ Features + +- Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97) +- Proxy configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) 
([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136) +- Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek) + + +## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13) + +### ๐Ÿš€ Features + +- AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19) +- Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek) +- Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30) +- BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) ([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31) +- Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by [@janbuchar](https://github.com/janbuchar), closes 
[#5](https://github.com/apify/crawlee-python/issues/5) +- Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110) +- Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek) + +### ๐Ÿ› Bug Fixes + +- **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot]) +- **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot]) +- Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek) +- Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) ([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek) + + + \ No newline at end of file diff --git a/website/versioned_docs/version-0.6/deployment/apify_platform.mdx b/website/versioned_docs/version-0.6/deployment/apify_platform.mdx new file mode 100644 index 0000000000..fc09127ea9 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/apify_platform.mdx @@ -0,0 +1,253 @@ 
+--- +id: apify-platform +title: Apify platform +description: Apify platform - large-scale and high-performance web scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py'; +import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py'; +import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py'; +import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py'; + +Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api). + +While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure. + +:::note + +We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry PI. We strive to make it work, but there are no guarantees. + +::: + +## Requirements + +To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up). + +Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation). + +Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. 
You can install it using `pip`: + +```bash +pip install apify +``` + +## Logging into Apify platform from Crawlee + +To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables. + +Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on. + +### Log in with CLI + +Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added. + +```bash +npm install -g apify-cli +apify login -t YOUR_API_TOKEN +``` + +### Log in with environment variables + +Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token. + +> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password) +> environment variable. Actor automatically infers that from your token, but it can be useful +> when you need to access proxies from a different account than your token represents. + +### Log in with Configuration + +Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there. + + + {LogWithConfigExample} + + +## What is an Actor + +When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. 
An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset. + +Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours. + +**Related links** + +- [Store of existing Actors](https://apify.com/store) +- [Documentation](https://docs.apify.com/actors) +- [View Actors in Apify Console](https://console.apify.com/actors) +- [API reference](https://apify.com/docs/api/v2#/reference/actors) + +## Running an Actor locally + +First let's create a boilerplate of the new Actor. You could use Apify CLI and just run: + +```bash +apify create my-hello-world +``` + +The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows: + +```bash +cd my-hello-world +apify run +``` + +## Running Crawlee code as an Actor + +For running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`. + +:::info NOTE +Adding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. It is needed to initialize your Actor (e.g. to set the correct storage implementation) and to correctly handle exiting the process. +::: + +Let's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide: + + + {CrawlerAsActorExample} + + +Note that you could also run your Actor (that is using Crawlee) locally with Apify CLI. 
You could start it via the following command in your project folder: + +```bash +apify run +``` + +## Deploying an Actor to Apify platform + +Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running: + +```bash +apify push +``` + +Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the +[Apify Actor](https://docs.apify.com/cli) documentation. + +## Usage on Apify platform + +You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section. + +## Storages + +There are several things worth mentioning here. + +### Helper functions for default Key-Value Store and Dataset + +To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use: +- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) +- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset) + +### Using platform storage in a local Actor + +When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), 
[`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage. + +Using each of these methods allows you to pass the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk. + +:::note +If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants `KeyValueStore.open()`, `Dataset.open()` and `RequestQueue.open()` will work the same. +::: + +{/* +### Getting public url of an item in the platform storage + +If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share. + + + {GetPublicUrlSource} + + +*/} + +### Exporting dataset data + +When the `Dataset` is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results. 
+ +**Related links** + +- [Apify platform storage documentation](https://docs.apify.com/storage) +- [View storage in Apify Console](https://console.apify.com/storage) +- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores) +- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets) +- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues) + +## Environment variables + +The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation. + +:::note + +It's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables. + +::: + +### `APIFY_TOKEN` + +The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage +or to run an Actor on the Apify platform. You can find your API token on the +[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page. + +### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR` + +By combining the env vars in various ways, you can greatly influence the Actor's behavior. + +| Env Vars | API | Storages | +| --------------------------------------- | --- | ---------------- | +| none OR `CRAWLEE_STORAGE_DIR` | no | local | +| `APIFY_TOKEN` | yes | Apify platform | +| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform | + +When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform +features and your data will be stored locally by default. If you want to access platform storages, +you can use the `force_cloud=true` option in their respective functions. 
+ +### `APIFY_PROXY_PASSWORD` + +Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation. +Assuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy) +in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var, +so in most cases, you don't need to touch it. You should use it when, for some reason, +you need access to Apify Proxy, but not access to Apify API, or when you need access to +proxy from a different account than your token represents. + +## Proxy management + +In addition to your own proxy servers and proxy servers acquired from +third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy) +for your scraping needs. + +### Apify proxy + +If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage you first should be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account). + + + {ProxyExample} + + +Note that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create `ProxyConfiguration` instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead. + +### Advanced Apify proxy configuration + +With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. +This allows you to get better proxy performance after some initial research. + + + {ProxyAdvancedExample} + + +Now your crawlers will use only Residential proxies from the US. Note that you must first get access +to a proxy group before you are able to use it. You can check proxy groups available to you +in the [proxy dashboard](https://console.apify.com/proxy). + +### Apify proxy vs. 
own proxies + +The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy. + +The difference is easy to remember. +- If you're using your own proxies - you should create a `ProxyConfiguration` instance directly. +- If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy. + +**Related links** + +- [Apify Proxy docs](https://docs.apify.com/proxy) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/apify/crawler_as_actor_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/apify/crawler_as_actor_example.py new file mode 100644 index 0000000000..53527d555b --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/apify/crawler_as_actor_example.py @@ -0,0 +1,27 @@ +import asyncio + +from apify import Actor + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Wrap the crawler code in an Actor context manager. 
+ async with Actor: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/apify/get_public_url.py b/website/versioned_docs/version-0.6/deployment/code_examples/apify/get_public_url.py new file mode 100644 index 0000000000..d12cfba300 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/apify/get_public_url.py @@ -0,0 +1,16 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + store = await Actor.open_key_value_store() + await store.set_value('your-file', {'foo': 'bar'}) + url = store.get_public_url('your-file') + Actor.log.info(f'KVS public URL: {url}') + # https://api.apify.com/v2/key-value-stores//records/your-file + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/apify/log_with_config_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/apify/log_with_config_example.py new file mode 100644 index 0000000000..dfefa7b5ae --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/apify/log_with_config_example.py @@ -0,0 +1,19 @@ +import asyncio + +from apify import Actor, Configuration + + +async def main() -> None: + # Create a new configuration with your API key. You can find it at + # https://console.apify.com/settings/integrations. It can be provided either + # as a parameter "token" or as an environment variable "APIFY_TOKEN". 
+ config = Configuration( + token='apify_api_YOUR_TOKEN', + ) + + async with Actor(config): + Actor.log.info('Hello from Apify platform!') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_advanced_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_advanced_example.py new file mode 100644 index 0000000000..1b5306bd39 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_advanced_example.py @@ -0,0 +1,20 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + password='apify_proxy_YOUR_PASSWORD', + # Specify the proxy group to use. + groups=['RESIDENTIAL'], + # Set the country code for the proxy. + country_code='US', + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_example.py new file mode 100644 index 0000000000..d546c5cc45 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/apify/proxy_example.py @@ -0,0 +1,24 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + # Create a new Apify Proxy configuration. The password can be found at + # https://console.apify.com/proxy/http-settings and should be provided either + # as a parameter "password" or as an environment variable "APIFY_PROXY_PASSWORD". 
+ proxy_configuration = await Actor.create_proxy_configuration( + password='apify_proxy_YOUR_PASSWORD', + ) + + if not proxy_configuration: + Actor.log.warning('Failed to create proxy configuration.') + return + + proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Proxy URL: {proxy_url}') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/google/cloud_run_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/google/cloud_run_example.py new file mode 100644 index 0000000000..c01a4f3821 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/google/cloud_run_example.py @@ -0,0 +1,55 @@ +# mypy: disable-error-code="misc" +import json +import os + +import uvicorn +from litestar import Litestar, get + +from crawlee import service_locator +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +# highlight-start +# Disable writing storage data to the file system +configuration = service_locator.get_configuration() +configuration.persist_storage = False +configuration.write_metadata = False +# highlight-end + + +@get('/') +async def main() -> str: + """The crawler entry point that will be called when the HTTP endpoint is accessed.""" + crawler = PlaywrightCrawler( + headless=True, + max_requests_per_crawl=10, + browser_type='firefox', + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + """Default request handler that processes each page during crawling.""" + context.log.info(f'Processing {context.request.url} ...') + title = await context.page.query_selector('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': await title.inner_text() if title else None, + } + ) + + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + data = await crawler.get_data() + + # Return the results as JSON to the client 
+ return json.dumps(data.items) + + +# Initialize the Litestar app with our route handler +app = Litestar(route_handlers=[main]) + +# Start the Uvicorn server using the `PORT` environment variable provided by GCP +# This is crucial - Cloud Run expects your app to listen on this specific port +uvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080'))) # noqa: S104 # Use all interfaces in a container, safely diff --git a/website/versioned_docs/version-0.6/deployment/code_examples/google/google_example.py b/website/versioned_docs/version-0.6/deployment/code_examples/google/google_example.py new file mode 100644 index 0000000000..f7180aa417 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/code_examples/google/google_example.py @@ -0,0 +1,62 @@ +# mypy: disable-error-code="misc" +import asyncio +import json +from datetime import timedelta + +import functions_framework +from flask import Request, Response + +from crawlee import service_locator +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + +# highlight-start +# Disable writing storage data to the file system +configuration = service_locator.get_configuration() +configuration.persist_storage = False +configuration.write_metadata = False +# highlight-end + + +async def main() -> str: + crawler = BeautifulSoupCrawler( + max_request_retries=1, + request_handler_timeout=timedelta(seconds=30), + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + await context.push_data(data) + await context.enqueue_links() + + 
await crawler.run(['https://crawlee.dev']) + + # highlight-start + # Extract data saved in `Dataset` + data = await crawler.get_data() + # Serialize to json string and return + return json.dumps(data.items) + # highlight-end + + +@functions_framework.http +def crawlee_run(request: Request) -> Response: + # You can pass data to your crawler using `request` + function_id = request.headers['Function-Execution-Id'] + response_str = asyncio.run(main()) + + # Return a response with the crawling results + return Response(response=response_str, status=200) diff --git a/website/versioned_docs/version-0.6/deployment/google_cloud.mdx b/website/versioned_docs/version-0.6/deployment/google_cloud.mdx new file mode 100644 index 0000000000..e4f1fbe480 --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/google_cloud.mdx @@ -0,0 +1,45 @@ +--- +id: gcp-cloud-run-functions +title: Cloud Run functions +description: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform. +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py'; + +[Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests. + +## Updating the project + +For the project foundation, use BeautifulSoupCrawler as described in this [example](../examples/beautifulsoup-crawler). + +Add [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`. + +Update the project code to make it compatible with Cloud Functions and return data in JSON format. 
Also add an entry point that Cloud Functions will use to run the project. + + + {GoogleFunctions.replace(/^.*?\n/, '')} + + +You can test your project locally. Start the server by running: + +```bash +functions-framework --target=crawlee_run +``` + +Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser. + +## Deploying to Google Cloud Platform + +In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout. + +When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard. + +Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies. + +Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`. + +After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block. diff --git a/website/versioned_docs/version-0.6/deployment/google_cloud_run.mdx b/website/versioned_docs/version-0.6/deployment/google_cloud_run.mdx new file mode 100644 index 0000000000..c9aef10c3d --- /dev/null +++ b/website/versioned_docs/version-0.6/deployment/google_cloud_run.mdx @@ -0,0 +1,51 @@ +--- +id: gcp-cloud-run +title: Cloud Run +description: Prepare your crawler to run in Cloud Run on Google Cloud Platform. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py'; + + +[Google Cloud Run](https://cloud.google.com/run) is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions). + +GCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/). + +## Preparing the project + +We'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves. + +:::info + +GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world). + +::: + + + {GoogleCloudRun.replace(/^.*?\n/, '')} + + + +:::tip + +Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.** + +::: + +## Deploying to Google Cloud Platform + +Now, we're ready to deploy! If you have initialized your project using `uvx crawlee create`, the initialization script has prepared a Dockerfile for you. + +All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). 
The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private. + +After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there. + +:::tip + +In case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping. + +::: diff --git a/website/versioned_docs/version-0.6/examples/add_data_to_dataset.mdx b/website/versioned_docs/version-0.6/examples/add_data_to_dataset.mdx new file mode 100644 index 0000000000..aa4164cacf --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/add_data_to_dataset.mdx @@ -0,0 +1,40 @@ +--- +id: add-data-to-dataset +title: Add data to dataset +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_pw.py'; +import DatasetExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_dataset.py'; + +This example demonstrates how to store extracted data into datasets using the `context.push_data` helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the `push_data` function. 
+ + + + + {BeautifulSoupExample} + + + + + {PlaywrightExample} + + + + +Each item in the dataset will be stored in its own file within the following directory: + +```text +{PROJECT_FOLDER}/storage/datasets/default/ +``` + +For more control, you can also open a dataset manually using the asynchronous constructor `Dataset.open` + + + {DatasetExample} + diff --git a/website/versioned_docs/version-0.6/examples/beautifulsoup_crawler.mdx b/website/versioned_docs/version-0.6/examples/beautifulsoup_crawler.mdx new file mode 100644 index 0000000000..160e4c4d65 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/beautifulsoup_crawler.mdx @@ -0,0 +1,15 @@ +--- +id: beautifulsoup-crawler +title: BeautifulSoup crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py'; + +This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `

`, `

` and `

` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. + + + {BeautifulSoupExample} + diff --git a/website/versioned_docs/version-0.6/examples/capture_screenshot_using_playwright.mdx b/website/versioned_docs/version-0.6/examples/capture_screenshot_using_playwright.mdx new file mode 100644 index 0000000000..614693b1e8 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/capture_screenshot_using_playwright.mdx @@ -0,0 +1,19 @@ +--- +id: capture-screenshots-using-playwright +title: Capture screenshots using Playwright +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py'; + +This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store. + +The `PlaywrightCrawler` is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method. + +The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page. 
+ + + {CaptureScreenshotExample} + diff --git a/website/versioned_docs/version-0.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx b/website/versioned_docs/version-0.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx new file mode 100644 index 0000000000..87ff540298 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx @@ -0,0 +1,27 @@ +--- +id: capturing-page-snapshots-with-error-snapshotter +title: Capturing page snapshots with ErrorSnapshotter +description: How to capture page snapshots on errors. +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import ApiLink from '@site/src/components/ApiLink'; +import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py'; +import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py'; + + +This example demonstrates how to capture page snapshots on first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's `Statistics`. The error snapshot can contain `html` file and `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both `PlaywrightCrawler` and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the html file, but only `PlaywrightCrawler` is able to capture page screenshot as well. 
+ + + + + { ParselCrawlerWithErrorSnapshotter } + + + + + { PlaywrightCrawlerWithErrorSnapshotter } + + + diff --git a/website/versioned_docs/version-0.6/examples/code_examples/adaptive_playwright_crawler.py b/website/versioned_docs/version-0.6/examples/code_examples/adaptive_playwright_crawler.py new file mode 100644 index 0000000000..f2851d502b --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/adaptive_playwright_crawler.py @@ -0,0 +1,65 @@ +import asyncio +from datetime import timedelta + +from playwright.async_api import Route + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + # Crawler created by following factory method will use `beautifulsoup` + # for parsing static content. + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=10, playwright_crawler_specific_kwargs={'headless': False} + ) + + @crawler.router.default_handler + async def request_handler_for_label( + context: AdaptivePlaywrightCrawlingContext, + ) -> None: + # Do some processing using `parsed_content` + context.log.info(context.parsed_content.title) + + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + # Find more links and enqueue them. + await context.enqueue_links() + # Save some data. + await context.push_data({'Visited url': context.request.url}) + + @crawler.pre_navigation_hook + async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler. 
+ + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright.""" + context.log.info(f'pre navigation hook for: {context.request.url} ...') + + @crawler.pre_navigation_hook(playwright_only=True) + async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler. + + It is safe to access `page` object. + """ + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info( + f'Playwright only pre navigation hook for: {context.request.url} ...' + ) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_bs.py b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_bs.py new file mode 100644 index 0000000000..4318cbe0d4 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_bs.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'html': str(context.soup)[:1000], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of requests. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_dataset.py b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_dataset.py new file mode 100644 index 0000000000..b1d9aba923 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_dataset.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Open dataset manually using asynchronous constructor open(). + dataset = await Dataset.open() + + # Interact with dataset directly. + await dataset.push_data({'key': 'value'}) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_pw.py b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_pw.py new file mode 100644 index 0000000000..8eb714aef3 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/add_data_to_dataset_pw.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'html': str(await context.page.content())[:1000], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of requests. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler.py b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler.py new file mode 100644 index 0000000000..5e9701d7cb --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler.py @@ -0,0 +1,57 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import ( + BasicCrawlingContext, + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically + # loads the URLs and parses their HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler( + # On error, retry each page at most once. + max_request_retries=1, + # Increase the timeout for processing each page to 30 seconds. + request_handler_timeout=timedelta(seconds=30), + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - soup: the BeautifulSoup object containing the parsed HTML of the response. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Register pre navigation hook which will be called before each request. + # This hook is optional and does not need to be defined at all. + @crawler.pre_navigation_hook + async def some_hook(context: BasicCrawlingContext) -> None: + pass + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py new file mode 100644 index 0000000000..38e5623939 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py @@ -0,0 +1,56 @@ +import asyncio + +from crawlee._types import BasicCrawlingContext +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Keep the crawler alive even when there are no requests to be processed now. + keep_alive=True, + ) + + def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None: + """Stop crawler once specific url is visited. 
+
+        Example of guard condition to stop the crawler."""
+        if context.request.url == 'https://crawlee.dev/docs/examples':
+            crawler.stop(
+                'Stop crawler that was in keep_alive state after specific url was visited'
+            )
+        else:
+            context.log.info('keep_alive=True, waiting for more requests to come.')
+
+    async def add_request_later(url: str, after_s: int) -> None:
+        """Add requests to the queue after some time. Can be done by external code."""
+        # Just an example of request being added to the crawler later,
+        # when it is waiting due to `keep_alive=True`.
+        await asyncio.sleep(after_s)
+        await crawler.add_requests([url])
+
+    # Define the default request handler, which will be called for every request.
+    @crawler.router.default_handler
+    async def request_handler(context: BasicCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+        # Stop crawler if some guard condition has been met.
+        stop_crawler_if_url_visited(context)
+
+    # Start some tasks that will add some requests later to simulate real situation,
+    # where requests are added later by external code.
+    add_request_later_task1 = asyncio.create_task(
+        add_request_later(url='https://crawlee.dev', after_s=1)
+    )
+    add_request_later_task2 = asyncio.create_task(
+        add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
+    )
+
+    # Run the crawler without the initial list of requests.
+    # Wait for more requests to be added to the queue later due to `keep_alive=True`.
+ await crawler.run() + + await asyncio.gather(add_request_later_task1, add_request_later_task2) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_stop.py b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_stop.py new file mode 100644 index 0000000000..2069bd6ecb --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/beautifulsoup_crawler_stop.py @@ -0,0 +1,41 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically + # loads the URLs and parses their HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - soup: the BeautifulSoup object containing the parsed HTML of the response. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Create custom condition to stop crawler once it finds what it is looking for. + if 'crawlee' in context.request.url: + crawler.stop( + reason='Manual stop of crawler after finding `crawlee` in the url.' + ) + + # Extract data from the page. + data = { + 'url': context.request.url, + } + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/capture_screenshot_using_playwright.py b/website/versioned_docs/version-0.6/examples/code_examples/capture_screenshot_using_playwright.py new file mode 100644 index 0000000000..e4b4c1ec22 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/capture_screenshot_using_playwright.py @@ -0,0 +1,47 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import KeyValueStore + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + ) + + # Open the default key-value store. + kvs = await KeyValueStore.open() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. + screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/configure_json_logging.py b/website/versioned_docs/version-0.6/examples/code_examples/configure_json_logging.py new file mode 100644 index 0000000000..25cb37c745 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/configure_json_logging.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import asyncio +import inspect +import logging +import sys +from typing import TYPE_CHECKING + +from loguru import logger + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + +if TYPE_CHECKING: + from loguru import Record + + +# Configure loguru interceptor to capture standard logging output +class InterceptHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + # Get corresponding Loguru level if it exists + try: + level: str | int = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message + frame, depth = inspect.currentframe(), 0 + while frame: + filename = frame.f_code.co_filename + is_logging = filename == logging.__file__ + is_frozen = 'importlib' in filename and '_bootstrap' in filename + if depth > 0 and not (is_logging | is_frozen): + break + frame = frame.f_back + depth += 1 + + dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) + standard_attrs = set(dummy_record.__dict__.keys()) + extra_dict = { + key: value + for key, value in record.__dict__.items() + if key not in standard_attrs + } + + ( + logger.bind(**extra_dict) + .opt(depth=depth, exception=record.exc_info) + .patch(lambda loguru_record: loguru_record.update({'name': record.name})) + .log(level, record.getMessage()) + ) + + +# Configure loguru formatter +def formatter(record: Record) -> str: + basic_format 
= '[{name}] | {level: ^8} | - {message}' + if record['extra']: + basic_format = basic_format + ' {extra}' + return f'{basic_format}\n' + + +# Remove default loguru logger +logger.remove() + +# Set up loguru with JSONL serialization in file `crawler.log` +logger.add('crawler.log', format=formatter, serialize=True, level='INFO') + +# Set up loguru logger for console +logger.add(sys.stderr, format=formatter, colorize=True, level='INFO') + +# Configure standard logging to use our interceptor +logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True) + + +async def main() -> None: + # Initialize crawler with disabled table logs + crawler = HttpCrawler( + configure_logging=False, # Disable default logging configuration + statistics_log_format='inline', # Set inline formatting for statistics logs + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler + await crawler.run(['https://www.crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_bs.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_bs.py new file mode 100644 index 0000000000..ad5ef62f54 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_bs.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_pw.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_pw.py new file mode 100644 index 0000000000..4a6fb6e616 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_all_links_on_website_pw.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_bs.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_bs.py new file mode 100644 index 0000000000..e8cf82f2bc --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_bs.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler with the initial list of requests. + await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_pw.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_pw.py new file mode 100644 index 0000000000..b18d04c8ad --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_multiple_urls_pw.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler with the initial list of requests. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_bs.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_bs.py new file mode 100644 index 0000000000..8dfc1bdf85 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_bs.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all the documentation links found on the page, except for the examples. + await context.enqueue_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_pw.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_pw.py new file mode 100644 index 0000000000..98a2f0435b --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_specific_links_on_website_pw.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all the documentation links found on the page, except for the examples. + await context.enqueue_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_all_links.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_all_links.py new file mode 100644 index 0000000000..b253a9566f --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_all_links.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. Any URLs found will be matched by + # this strategy, even if they go off the site you are currently crawling. + await context.enqueue_links(strategy='all') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py new file mode 100644 index 0000000000..0fa264ef20 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same domain will enqueue all links found that + # are on the same hostname as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-domain') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py new file mode 100644 index 0000000000..0259cafe67 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same hostname will enqueue all links found that are on + # the same hostname (including subdomains) as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-hostname') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py new file mode 100644 index 0000000000..46e9f32759 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same origin will enqueue all links found that are on + # the same origin as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-origin') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_csv.py b/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_csv.py new file mode 100644 index 0000000000..115474fc61 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_csv.py @@ -0,0 +1,37 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. 
Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the entire dataset to a CSV file. + await crawler.export_data_csv(path='results.csv') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_json.py b/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_json.py new file mode 100644 index 0000000000..5c871fb228 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/export_entire_dataset_to_file_json.py @@ -0,0 +1,37 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the entire dataset to a JSON file. + await crawler.export_data_json(path='results.json') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py b/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py new file mode 100644 index 0000000000..1fcafea1d6 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract all the documentation links found on the page, except for the examples. + extracted_links = await context.extract_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + # Some very custom filtering which can't be achieved by `extract_links` arguments. + max_link_length = 30 + filtered_links = [ + link for link in extracted_links if len(link.url) < max_link_length + ] + # Add filtered links to the request queue. 
+ await context.add_requests(filtered_links) + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py b/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py new file mode 100644 index 0000000000..032a25f19c --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract all the documentation links found on the page, except for the examples. + extracted_links = await context.extract_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + # Some very custom filtering which can't be achieved by `extract_links` arguments. + max_link_length = 30 + filtered_links = [ + link for link in extracted_links if len(link.url) < max_link_length + ] + # Add filtered links to the request queue. + await context.add_requests(filtered_links) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_crawler.py b/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_crawler.py new file mode 100644 index 0000000000..c00a784411 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_crawler.py @@ -0,0 +1,41 @@ +import asyncio +from urllib.parse import urlencode + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + response = context.http_response.read().decode('utf-8') + context.log.info(f'Response: {response}') # To see the response in the logs. + + # Prepare a POST request to the form endpoint. + request = Request.from_url( + url='https://httpbin.org/post', + method='POST', + headers={'content-type': 'application/x-www-form-urlencoded'}, + payload=urlencode( + { + 'custname': 'John Doe', + 'custtel': '1234567890', + 'custemail': 'johndoe@example.com', + 'size': 'large', + 'topping': ['bacon', 'cheese', 'mushroom'], + 'delivery': '13:00', + 'comments': 'Please ring the doorbell upon arrival.', + } + ).encode(), + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run([request]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_request.py b/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_request.py new file mode 100644 index 0000000000..14dc6c479d --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/fill_and_submit_web_form_request.py @@ -0,0 +1,28 @@ +import asyncio +from urllib.parse import urlencode + +from crawlee import Request + + +async def main() -> None: + # Prepare a POST request to the form endpoint. + request = Request.from_url( + url='https://httpbin.org/post', + method='POST', + headers={'content-type': 'application/x-www-form-urlencoded'}, + payload=urlencode( + { + 'custname': 'John Doe', + 'custtel': '1234567890', + 'custemail': 'johndoe@example.com', + 'size': 'large', + 'topping': ['bacon', 'cheese', 'mushroom'], + 'delivery': '13:00', + 'comments': 'Please ring the doorbell upon arrival.', + } + ).encode(), + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler.py b/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler.py new file mode 100644 index 0000000000..61ddb7484e --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler.py @@ -0,0 +1,47 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + +# Regex for identifying email addresses on a webpage. +EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' + + +async def main() -> None: + crawler = ParselCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + 'email_address_list': context.selector.re(EMAIL_REGEX), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Register pre navigation hook which will be called before each request. + # This hook is optional and does not need to be defined at all. + @crawler.pre_navigation_hook + async def some_hook(context: BasicCrawlingContext) -> None: + pass + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://github.com']) + + # Export the entire dataset to a JSON file. + await crawler.export_data_json(path='results.json') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py b/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py new file mode 100644 index 0000000000..d7c3674571 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py @@ -0,0 +1,31 @@ +import asyncio +from random import choice + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.statistics import Statistics + + +async def main() -> None: + crawler = ParselCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True) + ) + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Simulate various errors to demonstrate `ErrorSnapshotter` + # saving only the first occurrence of unique error. 
+ await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/playwright_block_requests.py b/website/versioned_docs/version-0.6/examples/code_examples/playwright_block_requests.py new file mode 100644 index 0000000000..991a67aede --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/playwright_block_requests.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Define the hook, which will be called before every request. + @crawler.pre_navigation_hook + async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # Block all requests to URLs that include `adsbygoogle.js` and also all defaults. + await context.block_requests(extra_url_patterns=['adsbygoogle.js']) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler.py b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler.py new file mode 100644 index 0000000000..f35332b063 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler.py @@ -0,0 +1,67 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + ) + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - page: Playwright's Page object, which allows interaction with the web page + # (see https://playwright.dev/python/docs/api/class-page for more details). + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + data = [] + + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + rank_element = await post.query_selector('.rank') + + # Extract the data we want from the elements. 
+ title = await title_element.inner_text() if title_element else None + rank = await rank_element.inner_text() if rank_element else None + href = await title_element.get_attribute('href') if title_element else None + + data.append({'title': title, 'rank': rank, 'href': href}) + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Define a hook that will be called each time before navigating to a new URL. + # The hook receives a context parameter, providing access to the request and + # browser page among other things. In this example, we log the URL being + # navigated to. + @crawler.pre_navigation_hook + async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_camoufox.py b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_camoufox.py new file mode 100644 index 0000000000..691197da55 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_camoufox.py @@ -0,0 +1,69 @@ +import asyncio + +# Camoufox is external package and needs to be installed. It is not included in crawlee. 
+from camoufox import AsyncNewBrowser +from typing_extensions import override + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserController, + PlaywrightBrowserPlugin, +) +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +class CamoufoxPlugin(PlaywrightBrowserPlugin): + """Example browser plugin that uses Camoufox browser, + but otherwise keeps the functionality of PlaywrightBrowserPlugin. + """ + + @override + async def new_browser(self) -> PlaywrightBrowserController: + if not self._playwright: + raise RuntimeError('Playwright browser plugin is not initialized.') + + return PlaywrightBrowserController( + browser=await AsyncNewBrowser( + self._playwright, **self._browser_launch_options + ), + # Increase, if camoufox can handle it in your use case. + max_open_pages_per_browser=1, + # This turns off the crawlee header_generation. Camoufox has its own. + header_generator=None, + ) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Custom browser pool. Gives users full control over browsers used by the crawler. + browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]), + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract some data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + + # Extract the data we want from the elements. + title = await title_element.inner_text() if title_element else None + + # Push the extracted data to the default dataset. 
+ await context.push_data({'title': title}) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py new file mode 100644 index 0000000000..90ddc6c3d4 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py @@ -0,0 +1,31 @@ +import asyncio +from random import choice + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.statistics import Statistics + + +async def main() -> None: + crawler = PlaywrightCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True) + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Simulate various errors to demonstrate `ErrorSnapshotter` + # saving only the first occurrence of unique error. 
+ await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py new file mode 100644 index 0000000000..a5d80797fb --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py @@ -0,0 +1,44 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.fingerprint_suite import ( + DefaultFingerprintGenerator, + HeaderGeneratorOptions, + ScreenOptions, +) + + +async def main() -> None: + # Use default fingerprint generator with desired fingerprint options. + # Generator will generate real looking browser fingerprint based on the options. + # Unspecified fingerprint options will be automatically selected by the generator. + fingerprint_generator = DefaultFingerprintGenerator( + header_options=HeaderGeneratorOptions(browsers=['chromium']), + screen_options=ScreenOptions(min_width=400), + ) + + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + # Fingerprint generator to be used. By default no fingerprint generation is done. + fingerprint_generator=fingerprint_generator, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_on_skipped_request.py b/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_on_skipped_request.py new file mode 100644 index 0000000000..5c7eca173f --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_on_skipped_request.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import SkippedReason +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # highlight-start + # This handler is called when a request is skipped + @crawler.on_skipped_request + async def skipped_request_handler(url: str, reason: SkippedReason) -> None: + # Check if the request was skipped due to robots.txt rules + if reason == 'robots_txt': + crawler.log.info(f'Skipped {url} due to robots.txt rules.') + + # highlight-end + + # Start the crawler with the specified URLs + # The login URL will be skipped and handled by the skipped_request_handler + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_txt_file.py b/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_txt_file.py new file mode 100644 index 0000000000..ebd63b1c2e --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/respect_robots_txt_file.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Start the crawler with the specified URLs + # The crawler will check the robots.txt file before making requests + # In this example, 'https://news.ycombinator.com/login' will be skipped + # because it's disallowed in the site's robots.txt file + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/code_examples/resuming_paused_crawl.py b/website/versioned_docs/version-0.6/examples/code_examples/resuming_paused_crawl.py new file mode 100644 index 0000000000..e87e428469 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/code_examples/resuming_paused_crawl.py @@ -0,0 +1,40 @@ +import asyncio + +from crawlee import ConcurrencySettings, service_locator +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + +# Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run. +# This makes the scraper continue from where it left off in the previous run. 
+# The recommended way to achieve this behavior is setting the environment variable +# `CRAWLEE_PURGE_ON_START=0` +configuration = service_locator.get_configuration() +configuration.purge_on_start = False + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Let's slow down the crawler for a demonstration + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=20) + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # List of links for crawl + requests = [ + 'https://crawlee.dev', + 'https://crawlee.dev/python/docs', + 'https://crawlee.dev/python/docs/examples', + 'https://crawlee.dev/python/docs/guides', + 'https://crawlee.dev/python/docs/quick-start', + ] + + await crawler.run(requests) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/examples/crawl_all_links_on_website.mdx b/website/versioned_docs/version-0.6/examples/crawl_all_links_on_website.mdx new file mode 100644 index 0000000000..f17c63920f --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawl_all_links_on_website.mdx @@ -0,0 +1,33 @@ +--- +id: crawl-all-links-on-website +title: Crawl all links on website +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_pw.py'; + +This example uses the `enqueue_links` helper to add new links to the `RequestQueue` as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. 
This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages. + +:::tip + +If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option, which is an instance of the `EnqueueStrategy` type alias. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example. + +::: + + + + + {BeautifulSoupExample} + + + + + {PlaywrightExample} + + + diff --git a/website/versioned_docs/version-0.6/examples/crawl_multiple_urls.mdx b/website/versioned_docs/version-0.6/examples/crawl_multiple_urls.mdx new file mode 100644 index 0000000000..2d3d370283 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawl_multiple_urls.mdx @@ -0,0 +1,27 @@ +--- +id: crawl-multiple-urls +title: Crawl multiple URLs +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py'; + +This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently. 
+ + + + + {BeautifulSoupExample} + + + + + {PlaywrightExample} + + + diff --git a/website/versioned_docs/version-0.6/examples/crawl_specific_links_on_website.mdx b/website/versioned_docs/version-0.6/examples/crawl_specific_links_on_website.mdx new file mode 100644 index 0000000000..b350568421 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawl_specific_links_on_website.mdx @@ -0,0 +1,47 @@ +--- +id: crawl-specific-links-on-website +title: Crawl specific links on website +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py'; + +import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py'; +import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py'; + +This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the `enqueue_links` helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the `RequestQueue`. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content. + + + + + {BeautifulSoupExample} + + + + + {PlaywrightExample} + + + + +## Even more control over the enqueued links + +`enqueue_links` is a convenience helper and internally it calls `extract_links` to find the links and `add_requests` to add them to the queue. 
If you need some additional custom filtering of the extracted links before enqueuing them, then consider using `extract_links` and `add_requests` instead of the `enqueue_links` + + + + + {BeautifulSoupExampleExtractAndAdd} + + + + + {PlaywrightExampleExtractAndAdd} + + + diff --git a/website/versioned_docs/version-0.6/examples/crawl_website_with_relative_links.mdx b/website/versioned_docs/version-0.6/examples/crawl_website_with_relative_links.mdx new file mode 100644 index 0000000000..4cf7bee845 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawl_website_with_relative_links.mdx @@ -0,0 +1,52 @@ +--- +id: crawl-website-with-relative-links +title: Crawl website with relative links +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import AllLinksExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_all_links.py'; +import SameDomainExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_domain.py'; +import SameHostnameExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_hostname.py'; +import SameOriginExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_origin.py'; + +When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the `enqueue_links` method on the crawler context, which will automatically find and add these links to the crawler's `RequestQueue`. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context. + +:::note + +For these examples, we are using the `BeautifulSoupCrawler`. However, the same method is available for other crawlers as well. 
You can use it in exactly the same way. + +::: + +`EnqueueStrategy` type alias provides four distinct strategies for crawling relative links: + +- `all` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites. +- `same-domain` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included. +- `same-hostname` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains. +- `same-origin` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl. + + + + + {AllLinksExample} + + + + + {SameDomainExample} + + + + + {SameHostnameExample} + + + + + {SameOriginExample} + + + diff --git a/website/versioned_docs/version-0.6/examples/crawler_keep_alive.mdx b/website/versioned_docs/version-0.6/examples/crawler_keep_alive.mdx new file mode 100644 index 0000000000..2e6c6640c7 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawler_keep_alive.mdx @@ -0,0 +1,15 @@ +--- +id: crawler-keep-alive +title: Keep a Crawler alive waiting for more requests +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py'; + +This example demonstrates how to keep crawler alive even when there are no requests at the moment by using `keep_alive=True` argument of `BasicCrawler.__init__`. 
This is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. To stop the crawler that was started with `keep_alive=True` you can call `crawler.stop()`. + + + {BeautifulSoupExample} + diff --git a/website/versioned_docs/version-0.6/examples/crawler_stop.mdx b/website/versioned_docs/version-0.6/examples/crawler_stop.mdx new file mode 100644 index 0000000000..4ea7f28565 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/crawler_stop.mdx @@ -0,0 +1,15 @@ +--- +id: crawler-stop +title: Stopping a Crawler with stop method +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py'; + +This example demonstrates how to use `stop` method of `BasicCrawler` to stop crawler once the crawler finds what it is looking for. This method is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being concurrently processed are going to get finished. It is possible to call `stop` method with optional argument `reason` that is a string that will be used in logs and it can improve logs readability especially if you have multiple different conditions for triggering `stop`. 
+ + + {BeautifulSoupExample} + diff --git a/website/versioned_docs/version-0.6/examples/export_entire_dataset_to_file.mdx b/website/versioned_docs/version-0.6/examples/export_entire_dataset_to_file.mdx new file mode 100644 index 0000000000..72418ebe66 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/export_entire_dataset_to_file.mdx @@ -0,0 +1,33 @@ +--- +id: export-entire-dataset-to-file +title: Export entire dataset to file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py'; +import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py'; + +This example demonstrates how to use the `BasicCrawler.export_data` method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format. + +:::note + +For these examples, we are using the `BeautifulSoupCrawler`. However, the same method is available for other crawlers as well. You can use it in exactly the same way. 
+ +::: + + + + + {JsonExample} + + + + + {CsvExample} + + + diff --git a/website/versioned_docs/version-0.6/examples/fill_and_submit_web_form.mdx b/website/versioned_docs/version-0.6/examples/fill_and_submit_web_form.mdx new file mode 100644 index 0000000000..841a2616ee --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/fill_and_submit_web_form.mdx @@ -0,0 +1,113 @@ +--- +id: fill-and-submit-web-form +title: Fill and submit web form +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RequestExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_request.py'; +import CrawlerExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_crawler.py'; + +This example demonstrates how to fill and submit a web form using the `HttpCrawler` crawler. The same approach applies to any crawler that inherits from it, such as the `BeautifulSoupCrawler` or `ParselCrawler`. + +We are going to use the [httpbin.org](https://httpbin.org) website to demonstrate how it works. + +## Investigate the form fields + +First, we need to examine the form fields and the form's action URL. You can do this by opening the [httpbin.org/forms/post](https://httpbin.org/forms/post) page in a browser and inspecting the form fields. + +In Chrome, right-click on the page and select "Inspect" or press `Ctrl+Shift+I`. +Use the element selector (`Ctrl+Shift+C`) to click on the form element you want to inspect. + +![HTML input element name](/img/fill-and-submit-web-form/00.jpg 'HTML input element name.') + +Identify the field names. For example, the customer name field is `custname`, the email field is `custemail`, and the phone field is `custtel`. + +Now navigate to the "Network" tab in developer tools and submit the form by clicking the "Submit order" button. 
+ +![Submitting the form](/img/fill-and-submit-web-form/01.jpg 'Submitting the form.') + +Find the form submission request and examine its details. The "Headers" tab will show the submission URL, in this case, it is `https://httpbin.org/post`. + +![Network request investigation](/img/fill-and-submit-web-form/02.jpg 'Network request investigation.') + +The "Payload" tab will display the form fields and their submitted values. This method could be an alternative to inspecting the HTML source code directly. + +![Network payload investigation](/img/fill-and-submit-web-form/03.jpg 'Network payload investigation.') + +## Preparing a POST request + +Now, let's create a POST request with the form fields and their values using the `Request` class, specifically its `Request.from_url` constructor: + + + {RequestExample} + + +Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach. + +## Implementing the crawler + +Finally, let's implement the crawler and run it with the prepared request. Although we are using the `HttpCrawler`, the process is the same for any crawler that inherits from it. + + + {CrawlerExample} + + +## Running the crawler + +Finally, run your crawler. Your logs should show something like this: + +```plaintext +... +[crawlee.http_crawler._http_crawler] INFO Processing https://httpbin.org/post ... 
+[crawlee.http_crawler._http_crawler] INFO Response: { + "args": {}, + "data": "", + "files": {}, + "form": { + "comments": "Please ring the doorbell upon arrival.", + "custemail": "johndoe@example.com", + "custname": "John Doe", + "custtel": "1234567890", + "delivery": "13:00", + "size": "large", + "topping": [ + "bacon", + "cheese", + "mushroom" + ] + }, + "headers": { + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Content-Length": "190", + "Content-Type": "application/x-www-form-urlencoded", + "Host": "httpbin.org", + "User-Agent": "python-httpx/0.27.0", + "X-Amzn-Trace-Id": "Root=1-66c849d6-1ae432fb7b4156e6149ff37f" + }, + "json": null, + "origin": "78.80.81.196", + "url": "https://httpbin.org/post" +} + +[crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish +[crawlee.http_crawler._http_crawler] INFO Final request statistics: +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ requests_finished โ”‚ 1 โ”‚ +โ”‚ requests_failed โ”‚ 0 โ”‚ +โ”‚ retry_histogram โ”‚ [1] โ”‚ +โ”‚ request_avg_failed_duration โ”‚ None โ”‚ +โ”‚ request_avg_finished_duration โ”‚ 0.678442 โ”‚ +โ”‚ requests_finished_per_minute โ”‚ 85 โ”‚ +โ”‚ requests_failed_per_minute โ”‚ 0 โ”‚ +โ”‚ request_total_duration โ”‚ 0.678442 โ”‚ +โ”‚ requests_total โ”‚ 1 โ”‚ +โ”‚ crawler_runtime โ”‚ 0.707666 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +This log output confirms that the crawler successfully submitted the form and processed the response. Congratulations! You have successfully filled and submitted a web form using the `HttpCrawler`. 
diff --git a/website/versioned_docs/version-0.6/examples/json_logging.mdx b/website/versioned_docs/version-0.6/examples/json_logging.mdx new file mode 100644 index 0000000000..06dd2ac492 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/json_logging.mdx @@ -0,0 +1,57 @@ +--- +id: configure-json-logging +title: ะกonfigure JSON logging +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import JsonLoggingExample from '!!raw-loader!roa-loader!./code_examples/configure_json_logging.py'; + +This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By using the `use_table_logs=False` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON. + +The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems. + + + {JsonLoggingExample} + + +Here's an example of what a crawler statistics log entry in JSONL format. 
+ +```json +{ + "text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n", + "record": { + "elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 }, + "exception": null, + "extra": { + "requests_finished": 1, + "requests_failed": 0, + "retry_histogram": [1], + "request_avg_failed_duration": null, + "request_avg_finished_duration": 3.57098, + "requests_finished_per_minute": 17, + "requests_failed_per_minute": 0, + "request_total_duration": 3.57098, + "requests_total": 1, + "crawler_runtime": 3.59165 + }, + "file": { + "name": "_basic_crawler.py", + "path": "/crawlers/_basic/_basic_crawler.py" + }, + "function": "run", + "level": { "icon": "โ„น๏ธ", "name": "INFO", "no": 20 }, + "line": 583, + "message": "Final request statistics:", + "module": "_basic_crawler", + "name": "HttpCrawler", + "process": { "id": 198383, "name": "MainProcess" }, + "thread": { "id": 135312814966592, "name": "MainThread" }, + "time": { + "repr": "2025-03-17 17:14:45.339150+00:00", + "timestamp": 1742231685.33915 + } + } +} +``` diff --git a/website/versioned_docs/version-0.6/examples/parsel_crawler.mdx b/website/versioned_docs/version-0.6/examples/parsel_crawler.mdx new file mode 100644 index 0000000000..b0eca7eb28 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/parsel_crawler.mdx @@ -0,0 +1,15 @@ +--- +id: parsel-crawler +title: Parsel crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py'; + +This example shows how to use `ParselCrawler` to crawl a website or a list of URLs. 
Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. + + + {ParselCrawlerExample} + diff --git a/website/versioned_docs/version-0.6/examples/playwright_crawler.mdx b/website/versioned_docs/version-0.6/examples/playwright_crawler.mdx new file mode 100644 index 0000000000..70b0bc8afb --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/playwright_crawler.mdx @@ -0,0 +1,19 @@ +--- +id: playwright-crawler +title: Playwright crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py'; + +This example demonstrates how to use `PlaywrightCrawler` to recursively scrape the Hacker news website using headless Chromium and Playwright. + +The `PlaywrightCrawler` manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content. 
+ +A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. + + + {PlaywrightCrawlerExample} + diff --git a/website/versioned_docs/version-0.6/examples/playwright_crawler_adaptive.mdx b/website/versioned_docs/version-0.6/examples/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..c1f8875df8 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/playwright_crawler_adaptive.mdx @@ -0,0 +1,20 @@ +--- +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import AdaptivePlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/adaptive_playwright_crawler.py'; + +This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. + +A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by the HTTP-based sub crawler and the Playwright-based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for the Playwright sub crawler. 
+ +For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide') + + + {AdaptivePlaywrightCrawlerExample} + diff --git a/website/versioned_docs/version-0.6/examples/playwright_crawler_with_block_requests.mdx b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_block_requests.mdx new file mode 100644 index 0000000000..d7d5e15928 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_block_requests.mdx @@ -0,0 +1,27 @@ +--- +id: playwright-crawler-with-block-requests +title: Playwright crawler with block requests +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightBlockRequests from '!!raw-loader!roa-loader!./code_examples/playwright_block_requests.py'; + +This example demonstrates how to optimize your `PlaywrightCrawler` performance by blocking unnecessary network requests. + +The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed. + +The `block_requests` helper provides the most efficient way to block requests as it operates directly in the browser. + +By default, `block_requests` will block all URLs including the following patterns: + +```python +['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'] +``` + +You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`. 
+ + + {PlaywrightBlockRequests} + diff --git a/website/versioned_docs/version-0.6/examples/playwright_crawler_with_camoufox.mdx b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_camoufox.mdx new file mode 100644 index 0000000000..b627c9ba34 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_camoufox.mdx @@ -0,0 +1,26 @@ +--- +id: playwright-crawler-with-camoufox +title: Playwright crawler with Camoufox +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightCrawlerExampleWithCamoufox from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_camoufox.py'; + +This example demonstrates how to integrate Camoufox into `PlaywrightCrawler` using `BrowserPool` with custom `PlaywrightBrowserPlugin`. + +Camoufox is a stealthy minimalistic build of Firefox. For details please visit its homepage https://camoufox.com/ . +To be able to run this example you will need to install camoufox, as it is an external tool and is not part of Crawlee. For installation please see https://pypi.org/project/camoufox/. + +**Warning!** Camoufox is using a custom build of Firefox. This build can be hundreds of MB large. +You can either pre-download this file using following command `python3 -m camoufox fetch` or camoufox will download it automatically once you try to run it, and it does not find existing binary. +For more details please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface + +**Project template -** It is possible to generate project with Python code which includes Camoufox integration into crawlee through crawlee cli. Call `crawlee create` and pick `Playwright-camoufox` when asked for Crawler type. + +The example code after `PlaywrightCrawler` instantiation is similar to example describing the use of Playwright Crawler. 
The main difference is that in this example Camoufox will be used as the browser through BrowserPool. + + + {PlaywrightCrawlerExampleWithCamoufox} + diff --git a/website/versioned_docs/version-0.6/examples/playwright_crawler_with_fingerprint_generator.mdx b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_fingerprint_generator.mdx new file mode 100644 index 0000000000..04727cd74c --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/playwright_crawler_with_fingerprint_generator.mdx @@ -0,0 +1,17 @@ +--- +id: playwright-crawler-with-fingerprint-generator +title: Playwright crawler with fingerprint generator +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_fingerprint_generator.py'; + +This example demonstrates how to use `PlaywrightCrawler` together with `FingerprintGenerator` that will populate several browser attributes to mimic real browser fingerprint. To read more about fingerprints please see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting. + +You can implement your own fingerprint generator or use `DefaultFingerprintGenerator`. To use the generator initialize it with the desired fingerprint options. The generator will try to create fingerprint based on those options. Unspecified options will be automatically selected by the generator from the set of reasonable values. If some option is important for you, do not rely on the default and explicitly define it. 
+ + + {PlaywrightCrawlerExample} + diff --git a/website/versioned_docs/version-0.6/examples/respect_robots_txt_file.mdx b/website/versioned_docs/version-0.6/examples/respect_robots_txt_file.mdx new file mode 100644 index 0000000000..dc509e16b8 --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/respect_robots_txt_file.mdx @@ -0,0 +1,32 @@ +--- +id: respect-robots-txt-file +title: Respect robots.txt file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; +import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py'; + +This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. + +To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in `BasicCrawlerOptions`. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. + +As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. + +The code below demonstrates this behavior using the `BeautifulSoupCrawler`: + + + {RespectRobotsTxt} + + +## Handle with `on_skipped_request` + +If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from `BasicCrawler`. 
+ +Let's update the code by adding the `on_skipped_request` handler: + + + {OnSkippedRequest} + diff --git a/website/versioned_docs/version-0.6/examples/resuming_paused_crawl.mdx b/website/versioned_docs/version-0.6/examples/resuming_paused_crawl.mdx new file mode 100644 index 0000000000..8d2213d11d --- /dev/null +++ b/website/versioned_docs/version-0.6/examples/resuming_paused_crawl.mdx @@ -0,0 +1,35 @@ +--- +id: resuming-paused-crawl +title: Resuming a paused crawl +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ResumeCrawl from '!!raw-loader!roa-loader!./code_examples/resuming_paused_crawl.py'; + +This example demonstrates how to resume crawling from its last state when running locally, if for some reason it was unexpectedly terminated. + +If each run should continue crawling from the previous state, you can configure this using `purge_on_start` in `Configuration`. + +Use the code below and perform 2 sequential runs. During the 1st run, stop the crawler by pressing `CTRL+C`, and the 2nd run will resume crawling from where it stopped. + + + {ResumeCrawl} + + +Perform the 1st run, interrupting the crawler with `CTRL+C` after 2 links have been processed. + +![Run with interruption](/img/resuming-paused-crawl/00.webp 'Run with interruption.') + +Now resume crawling after the pause to process the remaining 3 links. + +![Resuming crawling](/img/resuming-paused-crawl/01.webp 'Resuming crawling.') + +Alternatively, use the environment variable `CRAWLEE_PURGE_ON_START=0` instead of using `configuration.purge_on_start = False`. 
+ +For example, when running code: + +```bash +CRAWLEE_PURGE_ON_START=0 python -m best_crawler +``` diff --git a/website/versioned_docs/version-0.6/guides/avoid_blocking.mdx b/website/versioned_docs/version-0.6/guides/avoid_blocking.mdx new file mode 100644 index 0000000000..3ada4e2446 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/avoid_blocking.mdx @@ -0,0 +1,47 @@ +--- +id: avoid-blocking +title: Avoid getting blocked +description: How to avoid getting blocked when scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py'; +import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py'; + +import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py'; + +A scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks. + +Browser fingerprint is a collection of browser attributes and significant features that can show if our browser is a bot or a real user. Moreover, most browsers have these unique features that allow the website to track the browser even within different IP addresses. 
This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, it should significantly reduce the blocking. + +## Using browser fingerprints + +Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in `PlaywrightCrawler` is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the `PlaywrightCrawler.__init__`, either pass your own implementation of `FingerprintGenerator` or use `DefaultFingerprintGenerator`. + + + {PlaywrightDefaultFingerprintGenerator} + + +In certain cases we want to narrow down the fingerprints used - e.g. specify a certain operating system, locale or browser. This is also possible with Crawlee - the crawler can have the generation algorithm customized to reflect the particular browser version and many more. For description of fingerprint generation options please see `HeaderGeneratorOptions`, `ScreenOptions` and `DefaultFingerprintGenerator.__init__`. See the example below: + + + {PlaywrightDefaultFingerprintGeneratorWithArgs} + + +If you do not want to use fingerprints, then pass `fingerprint_generator=None` argument to the `PlaywrightCrawler.__init__`. + +## Using Camoufox + +In some cases even `PlaywrightCrawler` with fingerprints is not enough. You can try using `PlaywrightCrawler` together with [Camoufox](https://camoufox.com/). 
See the example integration below: + + + {PlaywrightWithCamoufox} + + +**Related links** + +- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) +- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py b/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py new file mode 100644 index 0000000000..a6d2072ad3 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.fingerprint_suite import ( + DefaultFingerprintGenerator, + HeaderGeneratorOptions, + ScreenOptions, +) + + +async def main() -> None: + fingerprint_generator = DefaultFingerprintGenerator( + header_options=HeaderGeneratorOptions(browsers=['chromium']), + screen_options=ScreenOptions(min_width=400), + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py b/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py new file mode 100644 index 0000000000..5e1c8d2668 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py @@ -0,0 +1,23 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Fingerprint generator is used by default. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/error_handling/change_handle_error_status.py b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/change_handle_error_status.py new file mode 100644 index 0000000000..3b721545b2 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/change_handle_error_status.py @@ -0,0 +1,47 @@ +import asyncio +import json + +from crawlee import HttpHeaders +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.errors import HttpStatusCodeError +from crawlee.sessions import SessionPool + +# Using a placeholder refresh token for this example +REFRESH_TOKEN = 'PLACEHOLDER' +UNAUTHORIZED_CODE = 401 + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=2, + # Only treat 403 as a blocking status code, not 401 + session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}), + # Don't treat 401 responses as errors + ignore_http_error_status_codes=[UNAUTHORIZED_CODE], + ) + + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Now we can handle 401 responses ourselves + if context.http_response.status_code == UNAUTHORIZED_CODE: + # Get a fresh access token + headers = {'authorization': f'Bearer {REFRESH_TOKEN}'} + response = await context.send_request( + 'https://placeholder.org/refresh', headers=headers + ) + data = json.loads(response.read()) + # Add the 
new token to our `Request` headers + new_headers = { + **context.request.headers, + 'authorization': f'Bearer {data["access_token"]}', + } + context.request.headers = HttpHeaders(new_headers) + # Trigger a retry with our updated headers + raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE) + + await crawler.run(['http://httpbingo.org/status/401']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/error_handling/disable_retry.py b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/disable_retry.py new file mode 100644 index 0000000000..8d98eff312 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/disable_retry.py @@ -0,0 +1,30 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import HttpStatusCodeError, SessionError + + +async def main() -> None: + crawler = HttpCrawler(max_request_retries=5) + + # Create a parsing error for demonstration + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + raise ValueError('Simulated parsing error') + + # This handler runs before any retry attempts + @crawler.error_handler + async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error(f'Failed request {context.request.url}') + # Only allow retries for network-related errors + if not isinstance(error, (SessionError, HttpStatusCodeError)): + context.log.error('Non-network error detected') + # Stop further retry attempts for this `Request` + context.request.no_retry = True + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/error_handling/handle_proxy_error.py 
b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/handle_proxy_error.py new file mode 100644 index 0000000000..eddb843fdd --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/error_handling/handle_proxy_error.py @@ -0,0 +1,40 @@ +import asyncio + +from crawlee import Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import ProxyError + + +async def main() -> None: + # Set how many session rotations will happen before calling the error handler + # when ProxyError occurs + crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6) + + # For this example, we'll create a proxy error in our handler + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + raise ProxyError('Simulated proxy error') + + # This handler runs after all retry attempts are exhausted + @crawler.failed_request_handler + async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error(f'Failed request {context.request.url}, after 5 rotations') + request = context.request + # For proxy errors, we can add a new `Request` to try again + if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'): + context.log.info(f'Retrying {request.url} ...') + # Create a new `Request` with a modified key to avoid deduplication + new_request = Request.from_url( + request.url, unique_key=f'retry{request.unique_key}' + ) + + # Add the new `Request` to the `Queue` + rq = await crawler.get_request_manager() + await rq.add_request(new_request) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/http_clients/curl_impersonate_example.py 
b/website/versioned_docs/version-0.6/guides/code_examples/http_clients/curl_impersonate_example.py new file mode 100644 index 0000000000..28813a2f46 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/http_clients/curl_impersonate_example.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.http_clients import CurlImpersonateHttpClient + + +async def main() -> None: + http_client = CurlImpersonateHttpClient( + # Optional additional keyword arguments for `curl_cffi.requests.AsyncSession`. + timeout=10, + impersonate='chrome131', + ) + + crawler = BeautifulSoupCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/http_clients/httpx_example.py b/website/versioned_docs/version-0.6/guides/code_examples/http_clients/httpx_example.py new file mode 100644 index 0000000000..5c3c4883cb --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/http_clients/httpx_example.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.http_clients import HttpxHttpClient + + +async def main() -> None: + http_client = HttpxHttpClient( + # Optional additional keyword arguments for `httpx.AsyncClient`. + timeout=10, + follow_redirects=True, + ) + + crawler = BeautifulSoupCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/http_login.py b/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/http_login.py new file mode 100644 index 0000000000..5da5781045 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/http_login.py @@ -0,0 +1,85 @@ +import asyncio +import json +from datetime import datetime, timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import ( + HttpCrawler, + HttpCrawlingContext, +) +from crawlee.sessions import SessionPool + + +async def main() -> None: + crawler = HttpCrawler( + max_requests_per_crawl=10, + # Configure to use a single persistent session throughout the crawl + max_session_rotations=0, + # Limit request rate to avoid triggering anti-scraping measures + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), + session_pool=SessionPool( + max_pool_size=1, + create_session_settings={ + # Set high value to ensure the session isn't replaced during crawling + 'max_usage_count': 999_999, + # Set high value to prevent session expiration during crawling + 'max_age': timedelta(hours=999_999), + # Higher error tolerance before the session is considered blocked + # Make sure you implement proper error handling in your code + 'max_error_score': 100, + }, + ), + ) + + # Default request handler for normal page processing + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Specialized handler for the login API request + @crawler.router.handler('login') + async def login_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing login at {context.request.url} ...') + + # Verify that a session is available before proceeding + if not context.session: + raise 
RuntimeError('Session not found') + + # Parse the API response containing authentication tokens and user data + data = json.loads(context.http_response.read()) + + # Extract authentication data from the response + token = data['token'] + expires = data['expires'].replace('Z', '+00:00') + expires_int = int(datetime.fromisoformat(expires).timestamp()) + user_id = data['userId'] + username = data['username'] + + # Set authentication cookies in the session that will be used + # for subsequent requests + context.session.cookies.set(name='token', value=token, expires=expires_int) + context.session.cookies.set(name='userID', value=user_id) + context.session.cookies.set(name='userName', value=username) + + # After successful authentication, continue crawling with the + # authenticated session + await context.add_requests(['https://demoqa.com/BookStore/v1/Books']) + + # Create a POST request to the authentication API endpoint + # This will trigger the login_handler when executed + request = Request.from_url( + 'https://demoqa.com/Account/v1/Login', + label='login', + method='POST', + payload=json.dumps( + {'userName': 'crawlee_test', 'password': 'Test1234!'} + ).encode(), + headers={'Content-Type': 'application/json'}, + ) + + # Start the crawling process with the login request + await crawler.run([request]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/playwright_login.py b/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/playwright_login.py new file mode 100644 index 0000000000..9530fc1e00 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/login_crawler/playwright_login.py @@ -0,0 +1,70 @@ +import asyncio +from datetime import timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, +) +from crawlee.sessions import SessionPool + + +async def main() -> 
None: + crawler = PlaywrightCrawler( + max_requests_per_crawl=10, + headless=True, + browser_type='chromium', + # We only have one session and it shouldn't rotate + max_session_rotations=0, + # Limit crawling intensity to avoid blocking + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), + session_pool=SessionPool( + # Limit the pool to one session + max_pool_size=1, + create_session_settings={ + # High value for session usage limit + 'max_usage_count': 999_999, + # High value for session lifetime + 'max_age': timedelta(hours=999_999), + # High score allows the session to encounter more errors + # before crawlee decides the session is blocked + # Make sure you know how to handle these errors + 'max_error_score': 100, + }, + ), + ) + + # The main handler for processing requests + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # A handler for the login page + @crawler.router.handler('login') + async def login_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing login {context.request.url} ...') + + # Check if the session is available + if not context.session: + raise RuntimeError('Session not found') + + # Entering data into the form, `delay` to simulate human typing + # Without this, the data will be entered instantly + await context.page.type('#userName', 'crawlee_test', delay=100) + await context.page.type('#password', 'Test1234!', delay=100) + await context.page.click('#login', delay=100) + + # Wait for an element confirming that we have successfully + # logged in to the site + await context.page.locator('#userName-value').first.wait_for(state='visible') + context.log.info('Login successful!') + + # Moving on to the basic flow of crawling + await context.add_requests(['https://demoqa.com/books']) + + # We start crawling with login. 
This is necessary to access the rest of the pages + await crawler.run([Request.from_url('https://demoqa.com/login', label='login')]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/browser_configuration_example.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/browser_configuration_example.py new file mode 100644 index 0000000000..10ff84eba0 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/browser_configuration_example.py @@ -0,0 +1,43 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + headless=False, + browser_type='chromium', + # Browser launch options + browser_launch_options={ + # For support `msedge` channel you need to install it + # `playwright install msedge` + 'channel': 'msedge', + 'slow_mo': 200, + }, + # Context launch options, applied to each page as it is created + browser_new_context_options={ + 'color_scheme': 'dark', + # Set headers + 'extra_http_headers': { + 'Custom-Header': 'my-header', + 'Accept-Language': 'en', + }, + # Set only User Agent + 'user_agent': 'My-User-Agent', + }, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/multiple_launch_example.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/multiple_launch_example.py new file mode 100644 index 0000000000..59219b89c7 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/multiple_launch_example.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Create a plugin for each required browser. + plugin_chromium = PlaywrightBrowserPlugin( + browser_type='chromium', max_open_pages_per_browser=1 + ) + plugin_firefox = PlaywrightBrowserPlugin( + browser_type='firefox', max_open_pages_per_browser=1 + ) + + crawler = PlaywrightCrawler( + browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]), + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + browser_name = ( + context.page.context.browser.browser_type.name + if context.page.context.browser + else 'undefined' + ) + context.log.info(f'Processing {context.request.url} with {browser_name} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev', 'https://apify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py new file mode 100644 index 0000000000..6db2fb589d --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler + + +async def main() -> None: + crawler = PlaywrightCrawler( + browser_pool=BrowserPool( + plugins=[ + PlaywrightBrowserPlugin( + browser_type='chromium', + browser_launch_options={ + 'headless': False, + 'channel': 'msedge', + 'slow_mo': 200, + }, + browser_new_context_options={ + 'color_scheme': 'dark', + 'extra_http_headers': { + 'Custom-Header': 'my-header', + 'Accept-Language': 'en', + }, + 'user_agent': 'My-User-Agent', + }, + ) + ] + ) + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py new file mode 100644 index 0000000000..5a36456679 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py @@ -0,0 +1,34 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + @crawler.pre_navigation_hook + async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # will set a timeout for all navigation methods + context.page.set_default_navigation_timeout(600_000) + + # will set the page size before you go to the target URL + await context.page.set_viewport_size({'width': 1280, 'height': 1024}) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/handler.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/handler.py new file mode 100644 index 0000000000..ad88e054cd --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/handler.py @@ -0,0 +1,21 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py new file mode 100644 index 0000000000..c0008d3a29 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.crawlers import AdaptivePlaywrightCrawler + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={ + 'headless': False, + 'browser_type': 'chromium', + }, + # Common arguments relevant to all crawlers + max_crawl_depth=5, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py new file mode 100644 index 0000000000..c220d53be4 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.crawlers import AdaptivePlaywrightCrawler + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={ + 'headless': False, + 'browser_type': 'chromium', + }, + # Common arguments relevant to all crawlers + max_crawl_depth=5, + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py new file mode 100644 index 0000000000..b07b1592ae --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py @@ -0,0 +1,70 @@ +import asyncio + +from crawlee import Request +from crawlee._types import RequestHandlerRunResult +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, +) + + +class CustomRenderingTypePredictor(RenderingTypePredictor): + def __init__(self) -> None: + self._learning_data = list[tuple[Request, RenderingType]]() + + def predict(self, request: Request) -> RenderingTypePrediction: + # Some custom logic that produces some `RenderingTypePrediction` + # based on the `request` input. 
+ rendering_type: RenderingType = ( + 'static' if 'abc' in request.url else 'client only' + ) + + return RenderingTypePrediction( + # Recommends `static` rendering type -> HTTP-based sub crawler will be used. + rendering_type=rendering_type, + # Recommends that both sub crawlers should run with 20% chance. When both sub + # crawlers are running, the predictor can compare results and learn. + # High number means that predictor is not very confident about the + # `rendering_type`, low number means that predictor is very confident. + detection_probability_recommendation=0.2, + ) + + def store_result(self, request: Request, rendering_type: RenderingType) -> None: + # This function allows predictor to store new learning data and retrain itself + # if needed. `request` is input for prediction and `rendering_type` is the correct + # prediction. + self._learning_data.append((request, rendering_type)) + # retrain + + +def result_checker(result: RequestHandlerRunResult) -> bool: + # Some function that inspects produced `result` and returns `True` if the result + # is correct. + return bool(result) # Check something on result + + +def result_comparator( + result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult +) -> bool: + # Some function that inspects two results and returns `True` if they are + # considered equivalent. It is used when comparing results produced by HTTP-based + # sub crawler and playwright based sub crawler. + return ( + result_1.push_data_calls == result_2.push_data_calls + ) # For example compare `push_data` calls. + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + rendering_type_predictor=CustomRenderingTypePredictor(), + result_checker=result_checker, + result_comparator=result_comparator, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py new file mode 100644 index 0000000000..bd95bd9f8b --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py @@ -0,0 +1,39 @@ +import asyncio + +from playwright.async_api import Route + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + @crawler.pre_navigation_hook + async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler. + + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright. 
+ """ + context.log.info(f'pre navigation hook for: {context.request.url}') + + @crawler.pre_navigation_hook(playwright_only=True) + async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler.""" + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info( + f'Playwright only pre navigation hook for: {context.request.url}' + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_bs_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_bs_example.py new file mode 100644 index 0000000000..b6035097d6 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_bs_example.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_pw_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_pw_example.py new file mode 100644 index 0000000000..e193972399 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/inspecting_pw_example.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_bs_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_bs_example.py new file mode 100644 index 0000000000..63a2e703e3 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_bs_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + context.log.info(f'Extracted data: {data}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_pw_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_pw_example.py new file mode 100644 index 0000000000..e8e0ea8821 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/integration_pw_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + } + context.log.info(f'Extracted data: {data}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/quick_start_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/quick_start_example.py new file mode 100644 index 0000000000..3b43a48312 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/quick_start_example.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + + # The proxy URLs are rotated in a round-robin. + proxy_url_1 = await proxy_configuration.new_url() # http://proxy-1.com/ + proxy_url_2 = await proxy_configuration.new_url() # http://proxy-2.com/ + proxy_url_3 = await proxy_configuration.new_url() # http://proxy-1.com/ + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_bs_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_bs_example.py new file mode 100644 index 0000000000..1243b0e488 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_bs_example.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + use_session_pool=True, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_pw_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_pw_example.py new file mode 100644 index 0000000000..68309bda59 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/session_pw_example.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + use_session_pool=True, + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_bs_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_bs_example.py new file mode 100644 index 0000000000..37f69e6419 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_bs_example.py @@ -0,0 +1,39 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + tiered_proxy_urls=[ + # No proxy tier. + # Optional in case you do not want to use any proxy on lowest tier. 
+ [None], + # lower tier, cheaper, preferred as long as they work + [ + 'http://cheap-datacenter-proxy-1.com/', + 'http://cheap-datacenter-proxy-2.com/', + ], + # higher tier, more expensive, used as a fallback + [ + 'http://expensive-residential-proxy-1.com/', + 'http://expensive-residential-proxy-2.com/', + ], + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_pw_example.py b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_pw_example.py new file mode 100644 index 0000000000..2dcb5ad3bd --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/proxy_management/tiers_pw_example.py @@ -0,0 +1,39 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + tiered_proxy_urls=[ + # No proxy tier. + # Optional in case you do not want to use any proxy on lowest tier. 
+ [None], + # lower tier, cheaper, preferred as long as they work + [ + 'http://cheap-datacenter-proxy-1.com/', + 'http://cheap-datacenter-proxy-2.com/', + ], + # higher tier, more expensive, used as a fallback + [ + 'http://expensive-residential-proxy-1.com/', + 'http://expensive-residential-proxy-2.com/', + ], + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/rl_basic_example.py b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/rl_basic_example.py new file mode 100644 index 0000000000..abe4d55584 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/rl_basic_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.request_loaders import RequestList + + +async def main() -> None: + # Open the request list, if it does not exist, it will be created. + # Leave name empty to use the default request list. + request_list = RequestList( + name='my-request-list', + requests=[ + 'https://apify.com/', + 'https://crawlee.dev/', + 'https://crawlee.dev/python/', + ], + ) + + # Fetch and process requests from the queue. + while request := await request_list.fetch_next_request(): + # Do something with it... + + # And mark it as handled. 
+ await request_list.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example.py b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example.py new file mode 100644 index 0000000000..b0e83138ca --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.request_loaders import RequestList + + +async def main() -> None: + # Create a static request list. + request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) + + # Convert the request list to a request manager using the to_tandem method. + # It is a tandem with the default request queue. + request_manager = await request_list.to_tandem() + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler(request_manager=request_manager) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example_explicit.py b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example_explicit.py new file mode 100644 index 0000000000..17ba20c392 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/request_loaders/tandem_example_explicit.py @@ -0,0 +1,30 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.request_loaders import RequestList, RequestManagerTandem +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Create a static request list. 
+ request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) + + # Open the default request queue. + request_queue = await RequestQueue.open() + + # And combine them together to a single request manager. + request_manager = RequestManagerTandem(request_list, request_queue) + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler(request_manager=request_manager) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/__init__.py b/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/crawler.py b/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/crawler.py new file mode 100644 index 0000000000..37c6671856 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/crawler.py @@ -0,0 +1,54 @@ +import asyncio +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from typing import TypedDict + +from fastapi import FastAPI + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +class State(TypedDict): + """State available in the app.""" + + crawler: ParselCrawler + requests_to_results: dict[str, asyncio.Future[dict[str, str]]] + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[State]: + # Start up code that runs once when the app starts + + # Results will be stored in this dictionary + requests_to_results = dict[str, asyncio.Future[dict[str, str]]]() + + crawler = ParselCrawler( + # Keep the crawler alive even when there 
are no more requests to process now. + # This makes the crawler wait for more requests to be added later. + keep_alive=True + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() or '' + + # Extract data from the page and save it to the result dictionary. + requests_to_results[context.request.unique_key].set_result( + { + 'title': title, + } + ) + + # Start the crawler without awaiting it to finish + crawler.log.info(f'Starting crawler for the {app.title}') + run_task = asyncio.create_task(crawler.run([])) + + # Make the crawler and the result dictionary available in the app state + yield {'crawler': crawler, 'requests_to_results': requests_to_results} + + # Cleanup code that runs once when the app shuts down + crawler.stop() + # Wait for the crawler to finish + await run_task diff --git a/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/server.py b/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/server.py new file mode 100644 index 0000000000..64e192af37 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/running_in_web_server/server.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +from fastapi import FastAPI +from starlette.requests import Request +from starlette.responses import HTMLResponse + +import crawlee + +from .crawler import lifespan + +app = FastAPI(lifespan=lifespan, title='Crawler app') + + +@app.get('/', response_class=HTMLResponse) +def index() -> str: + return """ + + + +

Scraper server

+

To scrape some page, visit "scrape" endpoint with url parameter. + For example: + + /scrape?url=https://www.example.com + +

+ + +""" + + +@app.get('/scrape') +async def scrape_url(request: Request, url: str | None = None) -> dict: + if not url: + return {'url': 'missing', 'scrape result': 'no results'} + + # Generate random unique key for the request + unique_key = str(uuid4()) + + # Set the result future in the result dictionary so that it can be awaited + request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]() + + # Add the request to the crawler queue + await request.state.crawler.add_requests( + [crawlee.Request.from_url(url, unique_key=unique_key)] + ) + + # Wait for the result future to be finished + result = await request.state.requests_to_results[unique_key] + + # Clean the result from the result dictionary to free up memory + request.state.requests_to_results.pop(unique_key) + + # Return the result + return {'url': url, 'scrape result': result} diff --git a/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py b/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py new file mode 100644 index 0000000000..cbc1130bc7 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee import ConcurrencySettings +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + concurrency_settings = ConcurrencySettings( + # Set the maximum number of concurrent requests the crawler can run to 100. + max_concurrency=100, + # Limit the total number of requests to 10 per minute to avoid overwhelming + # the target website. + max_tasks_per_minute=10, + ) + + crawler = BeautifulSoupCrawler( + # Apply the defined concurrency settings to the crawler. + concurrency_settings=concurrency_settings, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py b/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py new file mode 100644 index 0000000000..4d491446d0 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee import ConcurrencySettings +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + concurrency_settings = ConcurrencySettings( + # Start with 8 concurrent tasks, as long as resources are available. + desired_concurrency=8, + # Maintain a minimum of 5 concurrent tasks to ensure steady crawling. + min_concurrency=5, + # Limit the maximum number of concurrent tasks to 10 to prevent + # overloading the system. + max_concurrency=10, + ) + + crawler = BeautifulSoupCrawler( + # Use the configured concurrency settings for the crawler. + concurrency_settings=concurrency_settings, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/multi_sessions_http.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/multi_sessions_http.py new file mode 100644 index 0000000000..aba4d5e9d0 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/multi_sessions_http.py @@ -0,0 +1,85 @@ +import asyncio +from datetime import timedelta +from itertools import count +from typing import Callable + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import RequestCollisionError +from crawlee.sessions import Session, SessionPool + + +# Define a function for creating sessions with simple logic for unique `id` generation. +# This is necessary if you need to specify a particular session for the first request, +# for example during authentication +def create_session_function() -> Callable[[], Session]: + counter = count() + + def create_session() -> Session: + return Session( + id=str(next(counter)), + max_usage_count=999_999, + max_age=timedelta(hours=999_999), + max_error_score=100, + blocked_status_codes=[403], + ) + + return create_session + + +async def main() -> None: + crawler = HttpCrawler( + # Adjust request limits according to your pool size + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=500), + # Requests are bound to specific sessions, no rotation needed + max_session_rotations=0, + session_pool=SessionPool( + max_pool_size=10, create_session_function=create_session_function() + ), + ) + + @crawler.router.default_handler + async def basic_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Initialize the session and bind the next request to this session if needed + @crawler.router.handler(label='session_init') + async def 
session_init(context: HttpCrawlingContext) -> None: + next_requests = [] + if context.session: + context.log.info(f'Init session {context.session.id}') + next_request = Request.from_url( + 'https://placeholder.dev', session_id=context.session.id + ) + next_requests.append(next_request) + + await context.add_requests(next_requests) + + # Handle errors when a session is blocked and no longer available in the pool + # when attempting to execute requests bound to it + @crawler.failed_request_handler + async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: + if isinstance(error, RequestCollisionError) and context.session: + context.log.error( + f'Request {context.request.url} failed, because the bound ' + 'session is unavailable' + ) + + # Create a pool of requests bound to their respective sessions + # Use `always_enqueue=True` if session initialization happens on a non-unique address, + # such as the site's main page + init_requests = [ + Request.from_url( + 'https://example.org/', + label='session_init', + session_id=str(session_id), + use_extended_unique_key=True, + ) + for session_id in range(1, 11) + ] + + await crawler.run(init_requests) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/one_session_http.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/one_session_http.py new file mode 100644 index 0000000000..28cec44b63 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/one_session_http.py @@ -0,0 +1,56 @@ +import asyncio +from datetime import timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import SessionError +from crawlee.sessions import SessionPool + + +async def main() -> None: + crawler = HttpCrawler( + # Limit requests per minute to reduce the 
chance of being blocked + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=50), + # Disable session rotation + max_session_rotations=0, + session_pool=SessionPool( + # Only one session in the pool + max_pool_size=1, + create_session_settings={ + # High value for session usage limit + 'max_usage_count': 999_999, + # High value for session lifetime + 'max_age': timedelta(hours=999_999), + # High score allows the session to encounter more errors + # before crawlee decides the session is blocked + # Make sure you know how to handle these errors + 'max_error_score': 100, + # 403 status usually indicates you're already blocked + 'blocked_status_codes': [403], + }, + ), + ) + + # Basic request handling logic + @crawler.router.default_handler + async def basic_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Handler for session initialization (authentication, initial cookies, etc.) + @crawler.router.handler(label='session_init') + async def session_init(context: HttpCrawlingContext) -> None: + if context.session: + context.log.info(f'Init session {context.session.id}') + + # Monitor if our session gets blocked and explicitly stop the crawler + @crawler.error_handler + async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: + if isinstance(error, SessionError) and context.session: + context.log.info(f'Session {context.session.id} blocked') + crawler.stop() + + await crawler.run([Request.from_url('https://example.org/', label='session_init')]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_basic.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_basic.py new file mode 100644 index 0000000000..958ad5a665 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_basic.py @@ -0,0 +1,48 @@ +import asyncio 
+import re + +from crawlee.crawlers import BasicCrawler, BasicCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = BasicCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + @crawler.router.default_handler + async def default_handler(context: BasicCrawlingContext) -> None: + # Send request, BasicCrawler automatically selects a session from the pool + # and sets a proxy for it. You can check it with `context.session` + # and `context.proxy_info`. + response = await context.send_request(context.request.url) + + page_content = response.read().decode() + title_match = re.search(r'(.*?)', page_content) + + if context.session and (title := title_match.group(1) if title_match else None): + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. 
+ + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_beautifulsoup.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_beautifulsoup.py new file mode 100644 index 0000000000..a54fd8425f --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_beautifulsoup.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + title = context.soup.title.get_text() if context.soup.title else None + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. 
+ + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_http.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_http.py new file mode 100644 index 0000000000..cd12d04bdf --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_http.py @@ -0,0 +1,44 @@ +import asyncio +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = HttpCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + page_content = context.http_response.read().decode() + title_match = re.search(r'(.*?)', page_content) + + if context.session and (title := title_match.group(1) if title_match else None): + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. 
+ + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_parsel.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_parsel.py new file mode 100644 index 0000000000..66752a63c3 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_parsel.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = ParselCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + title = context.selector.css('title::text').get() + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. 
+ + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_playwright.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_playwright.py new file mode 100644 index 0000000000..46a4c4f096 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_playwright.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + title = await context.page.title() + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. 
+ + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_standalone.py b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_standalone.py new file mode 100644 index 0000000000..32989dc7e0 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/session_management/sm_standalone.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.sessions import SessionPool + + +async def main() -> None: + # Override the default Session pool configuration. + async with SessionPool( + max_pool_size=100, + create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]}, + ) as session_pool: + session = await session_pool.get_session() + + # Increase the error_score. + session.mark_bad() + + # Throw away the session. + session.retire() + + # Lower the error_score and mark the session good. + session.mark_good() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_do_not_purge_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_do_not_purge_example.py new file mode 100644 index 0000000000..6514863555 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_do_not_purge_example.py @@ -0,0 +1,23 @@ +import asyncio + +from crawlee.configuration import Configuration +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Set the purge_on_start field to False to avoid purging the storage on start. + # highlight-next-line + configuration = Configuration(purge_on_start=False) + + # Pass the configuration to the crawler. 
+ crawler = HttpCrawler(configuration=configuration) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py new file mode 100644 index 0000000000..15435da7bf --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.crawlers import HttpCrawler +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + storage_client = MemoryStorageClient.from_config() + + # Call the purge_on_start method to explicitly purge the storage. + # highlight-next-line + await storage_client.purge_on_start() + + # Pass the storage client to the crawler. + crawler = HttpCrawler(storage_client=storage_client) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_basic_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_basic_example.py new file mode 100644 index 0000000000..9b67f36eb0 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_basic_example.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Open the dataset, if it does not exist, it will be created. + # Leave name empty to use the default dataset. + dataset = await Dataset.open() + + # Push a single row of data. + await dataset.push_data({'foo': 'bar'}) + + # Push multiple rows of data (anything JSON-serializable can be pushed). 
+ await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]) + + # Fetch all data from the dataset. + data = await dataset.get_data() + # Do something with it... + + # Remove the dataset. + await dataset.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_example.py new file mode 100644 index 0000000000..7e40824166 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create a new crawler (it can be any subclass of BasicCrawler). + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the (default) dataset. + await context.push_data(data) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the dataset to a file. 
+ await crawler.export_data(path='dataset.csv') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py new file mode 100644 index 0000000000..7c6a613b8f --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py @@ -0,0 +1,37 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storages import Dataset + + +async def main() -> None: + # Open the dataset, if it does not exist, it will be created. + # Leave name empty to use the default dataset. + dataset = await Dataset.open() + + # Create a new crawler (it can be any subclass of BasicCrawler). + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the dataset. + await dataset.push_data(data) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the dataset to the key-value store. 
+ await dataset.export_to(key='dataset', content_type='csv') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_add_requests_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_add_requests_example.py new file mode 100644 index 0000000000..15104cf6fc --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_add_requests_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # highlight-next-line + await context.add_requests(['https://apify.com/']) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_enqueue_links_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_enqueue_links_example.py new file mode 100644 index 0000000000..6c7392bc3b --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/helper_enqueue_links_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # highlight-next-line + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_basic_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_basic_example.py new file mode 100644 index 0000000000..7821fa75de --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_basic_example.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.storages import KeyValueStore + + +async def main() -> None: + # Open the key-value store, if it does not exist, it will be created. + # Leave name empty to use the default KVS. + kvs = await KeyValueStore.open() + + # Set a value associated with 'some-key'. + await kvs.set_value(key='some-key', value={'foo': 'bar'}) + + # Get the value associated with 'some-key'. + value = kvs.get_value('some-key') + # Do something with it... + + # Delete the value associated with 'some-key' by setting it to None. + await kvs.set_value(key='some-key', value=None) + + # Remove the key-value store. + await kvs.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_example.py new file mode 100644 index 0000000000..732ee41f76 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Create a new Playwright crawler. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. 
+ screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Get the key-value store from the context. # If it does not exist, + # it will be created. Leave name empty to use the default KVS. + kvs = await context.get_key_value_store() + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py new file mode 100644 index 0000000000..66a921bd04 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import KeyValueStore + + +async def main() -> None: + # Open the key-value store, if it does not exist, it will be created. + # Leave name empty to use the default KVS. + kvs = await KeyValueStore.open() + + # Create a new Playwright crawler. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. + screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_basic_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_basic_example.py new file mode 100644 index 0000000000..9e983bb9fe --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_basic_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Open the request queue, if it does not exist, it will be created. + # Leave name empty to use the default request queue. + request_queue = await RequestQueue.open(name='my-request-queue') + + # Add a single request. + await request_queue.add_request('https://apify.com/') + + # Add multiple requests as a batch. + await request_queue.add_requests_batched( + ['https://crawlee.dev/', 'https://crawlee.dev/python/'] + ) + + # Fetch and process requests from the queue. + while request := await request_queue.fetch_next_request(): + # Do something with it... + + # And mark it as handled. + await request_queue.mark_request_as_handled(request) + + # Remove the request queue. + await request_queue.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_example.py new file mode 100644 index 0000000000..ce6a34cb59 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is + # a default request manager, it will be opened, and fully managed if not specified. 
+ crawler = HttpCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Use context's add_requests method helper to add new requests from the handler. + await context.add_requests(['https://crawlee.dev/python/']) + + # Use crawler's add_requests method helper to add new requests. + await crawler.add_requests(['https://apify.com/']) + + # Run the crawler. You can optionally pass the list of initial requests. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py new file mode 100644 index 0000000000..21bedad0b9 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Open the request queue, if it does not exist, it will be created. + # Leave name empty to use the default request queue. + request_queue = await RequestQueue.open(name='my-request-queue') + + # Interact with the request queue directly, e.g. add a batch of requests. + await request_queue.add_requests_batched( + ['https://apify.com/', 'https://crawlee.dev/'] + ) + + # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request + # queue as request manager to it. It will be managed by the crawler. + crawler = HttpCrawler(request_manager=request_queue) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # And execute the crawler. + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/guides/crawler_login.mdx b/website/versioned_docs/version-0.6/guides/crawler_login.mdx new file mode 100644 index 0000000000..fc02014dde --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/crawler_login.mdx @@ -0,0 +1,41 @@ +--- +id: logging-in-with-a-crawler +title: Logging in with a crawler +description: How to log in to websites with Crawlee. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/playwright_login.py'; +import HttpLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/http_login.py'; + +Many websites require authentication to access their content. This guide demonstrates how to implement login functionality using both `PlaywrightCrawler` and `HttpCrawler`. + +## Session management for authentication + +When implementing authentication, you'll typically want to maintain the same `Session` throughout your crawl to preserve login state. This requires proper configuration of the `SessionPool`. For more details, see our [session management guide](./session-management). + +If your use case requires multiple authenticated sessions with different credentials, you can: +- Use the `new_session_function` parameter in `SessionPool` to customize session creation. +- Specify the `session_id` parameter in `Request` to bind specific requests to particular sessions. + +For this guide, we'll use [demoqa.com](https://demoqa.com/login), a testing site designed for automation practice that provides a login form and protected content. 
+ +## Login with Playwright crawler + +The following example demonstrates how to authenticate on a website using `PlaywrightCrawler`, which provides browser automation capabilities for filling out login forms. + + + {PlaywrightLogin} + + +## Login with HTTP crawler + +You can also use `HttpCrawler` (or its more specific variants like `ParselCrawler` or `BeautifulSoupCrawler`) to authenticate by sending a POST `Request` with your credentials directly to the authentication endpoint. + +HTTP-based authentication often varies significantly between websites. Using browser [DevTools](https://developer.chrome.com/docs/devtools/overview) to analyze the `Network` tab during manual login can help you understand the specific authentication flow, required headers, and body parameters for your target website. + + + {HttpLogin} + diff --git a/website/versioned_docs/version-0.6/guides/error_handling.mdx b/website/versioned_docs/version-0.6/guides/error_handling.mdx new file mode 100644 index 0000000000..abd1b33058 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/error_handling.mdx @@ -0,0 +1,44 @@ +--- +id: error-handling +title: Error handling +description: How to handle errors that occur during web crawling. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py'; +import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py'; +import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py'; + +This guide demonstrates techniques for handling common errors encountered during web crawling operations. + +## Handling proxy errors + +Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in `BasicCrawlerOptions`. 
If you can't get data because of proxy errors, you might want to try again. You can do this using `failed_request_handler`: + + + {HandleProxyError} + + +You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many. + +## Changing how error status codes are handled + +By default, when `Sessions` get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the `Session` as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management). + +Here's an example of how to change this behavior: + + + {ChangeHandleErrorStatus} + + +## Turning off retries for non-network errors + +Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that. + +Here's how to turn off retries for non-network errors using `error_handler`, which runs before Crawlee tries again: + + + {DisableRetry} + diff --git a/website/versioned_docs/version-0.6/guides/http_clients.mdx b/website/versioned_docs/version-0.6/guides/http_clients.mdx new file mode 100644 index 0000000000..2d79dabf8d --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/http_clients.mdx @@ -0,0 +1,50 @@ +--- +id: http-clients +title: HTTP clients +description: Crawlee supports multiple HTTP clients when making requests. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BsCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/curl_impersonate_example.py'; +import BsHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/httpx_example.py'; + +HTTP clients are utilized by the HTTP-based crawlers (e.g. `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication, rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/) or [curl-cffi](https://pypi.org/project/curl-cffi/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries are [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/) or [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but they cannot execute client-side JavaScript. + +## How to switch between HTTP clients + +In Crawlee we currently have two HTTP clients: `HttpxHttpClient`, which uses the `httpx` library, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter in the Crawler class. The default HTTP client is `HttpxHttpClient`. Below are examples of how to set the HTTP client for the `BeautifulSoupCrawler`. + + + + + {BsHttpxExample} + + + + + {BsCurlImpersonateExample} + + + + +### Installation + +Since `HttpxHttpClient` is the default HTTP client, you don't need to install additional packages to use it. If you want to use `CurlImpersonateHttpClient`, you need to install `crawlee` with the `curl-impersonate` extra. 
+ +```sh +python -m pip install 'crawlee[curl-impersonate]' +``` + +or install all available extras: + +```sh +python -m pip install 'crawlee[all]' +``` + +## How HTTP clients work + +We provide an abstract base class, `HttpClient`, which defines the necessary interface for all HTTP clients. HTTP clients are responsible for sending requests and receiving responses, as well as managing cookies, headers, and proxies. They provide methods that are called from crawlers. To implement your own HTTP client, inherit from the `HttpClient` class and implement the required methods. diff --git a/website/versioned_docs/version-0.6/guides/http_crawlers.mdx b/website/versioned_docs/version-0.6/guides/http_crawlers.mdx new file mode 100644 index 0000000000..42541fe456 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/http_crawlers.mdx @@ -0,0 +1,38 @@ +--- +id: http-crawlers +title: HTTP crawlers +description: Crawlee supports multiple HTTP crawlers that can be used to extract data from server-rendered webpages. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +Generic class `AbstractHttpCrawler` is parent to `BeautifulSoupCrawler`, `ParselCrawler` and `HttpCrawler` and it could be used as parent for your crawler with custom content parsing requirements. + +It already includes almost all the functionality to crawl webpages and the only missing part is the parser that should be used to parse HTTP responses, and a context dataclass that defines what context helpers will be available to user handler functions. + +## `BeautifulSoupCrawler` + +`BeautifulSoupCrawler` uses `BeautifulSoupParser` to parse the HTTP response and makes it available in `BeautifulSoupCrawlingContext` in the `.soup` or `.parsed_content` attribute. 
+ +## `ParselCrawler` + +`ParselCrawler` uses `ParselParser` to parse the HTTP response and makes it available in `ParselCrawlingContext` in the `.selector` or `.parsed_content` attribute. + +## `HttpCrawler` + +`HttpCrawler` uses `NoParser` that does not parse the HTTP response at all and is to be used if no parsing is required. + +## Creating your own HTTP crawler + +### Why? + +In case you want to use some custom parser for parsing HTTP responses, and the rest of the `AbstractHttpCrawler` functionality suits your needs. + +### How? + +You need to define at least 2 new classes and decide what will be the type returned by the parser's `parse` method. +Parser will inherit from `AbstractHttpParser` and it will need to implement all its abstract methods. Crawler will inherit from `AbstractHttpCrawler` and it will need to implement all its abstract methods. Newly defined parser is then used in the `parser` argument of `AbstractHttpCrawler.__init__` method. + +To get a better idea, and as an example, please see one of our own HTTP-based crawlers mentioned above. diff --git a/website/versioned_docs/version-0.6/guides/playwright_crawler.mdx b/website/versioned_docs/version-0.6/guides/playwright_crawler.mdx new file mode 100644 index 0000000000..124f09e8ad --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/playwright_crawler.mdx @@ -0,0 +1,70 @@ +--- +id: playwright-crawler +title: Playwright crawler +description: How to use the PlaywrightCrawler and its related components. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py'; +import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py'; +import PreNavigationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/pre_navigation_hook_example.py'; + +import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py'; + +A `PlaywrightCrawler` is a browser-based crawler. In contrast to HTTP-based crawlers like `ParselCrawler` or `BeautifulSoupCrawler`, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage. + +## When to use Playwright crawler + +Use `PlaywrightCrawler` in scenarios that require full browser capabilities, such as: + +- **Dynamic content rendering**: Required when pages rely on heavy JavaScript to load or modify content in the browser. +- **Anti-scraping protection**: Helpful for sites using JavaScript-based security or advanced anti-automation measures. +- **Complex cookie management**: Necessary for sites with session or cookie requirements that standard HTTP-based crawlers cannot handle easily. + +If [HTTP-based crawlers](https://crawlee.dev/python/docs/guides/http-crawlers) are insufficient, `PlaywrightCrawler` can address these challenges. See a [basic example](../examples/playwright-crawler) for a typical usage demonstration. 
+ +## Advanced configuration + +The `PlaywrightCrawler` uses other Crawlee components under the hood, notably `BrowserPool` and `PlaywrightBrowserPlugin`. These components let you to configure the browser and context settings, launch multiple browsers, and apply pre-navigation hooks. You can create your own instances of these components and pass them to the `PlaywrightCrawler` constructor. + +- The `PlaywrightBrowserPlugin` manages how browsers are launched and how browser contexts are created. It accepts [browser launch](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new context](https://playwright.dev/python/docs/api/class-browser#browser-new-context) options. +- The `BrowserPool` manages the lifecycle of browser instances (launching, recycling, etc.). You can customize its behavior to suit your needs. + +## Managing multiple browsers + +The `BrowserPool` allows you to manage multiple browsers. Each browser instance is managed by a separate `PlaywrightBrowserPlugin` and can be configured independently. This is useful for scenarios like testing multiple configurations or implementing browser rotation to help avoid blocks or detect different site behaviors. + + + {MultipleLaunchExample} + + +## Browser launch and context configuration + +The `PlaywrightBrowserPlugin` provides access to all relevant Playwright configuration options for both [browser launches](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new browser contexts](https://playwright.dev/python/docs/api/class-browser#browser-new-context). You can specify these options in the constructor of `PlaywrightBrowserPlugin` or `PlaywrightCrawler`: + + + {BrowserConfigurationExample} + + +You can also configure each plugin used by `BrowserPool`: + + + {PluginBrowserConfigExample} + + +For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). 
[Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with `PlaywrightCrawler`. + +## Page configuration with pre-navigation hooks + +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `PlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. + + + {PreNavigationExample} + + +## Conclusion + +This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, and apply pre-navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-0.6/guides/playwright_crawler_adaptive.mdx b/website/versioned_docs/version-0.6/guides/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..696bc15163 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/playwright_crawler_adaptive.mdx @@ -0,0 +1,94 @@ +--- +id: adaptive-playwright-crawler +title: AdaptivePlaywrightCrawler +description: How to use the AdaptivePlaywrightCrawler. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/handler.py'; +import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/pre_nav_hooks.py'; + +import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_beautifulsoup.py'; +import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_parsel.py'; +import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_prediction.py'; + +An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. + +Detection is done based on the `RenderingTypePredictor` with default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages. + +## When to use AdaptivePlaywrightCrawler + +Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites. + +Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by a code executed in a browsing client. 
+ +## Request handler and adaptive context helpers + +Request handler for `AdaptivePlaywrightCrawler` works on a special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are several helper methods on this context that can be called regardless of how the context was created. + +`wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector and return once it is found (within the timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will try to locate the selector within the timeout using playwright. + +`query_selector_one` accepts `css` selector as first argument and timeout as second argument. This function acts similarly to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. + +`query_selector_all` same as `query_selector_one`, but returns all found selectors. + +`parse_with_static_parser` will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls `wait_for_selector` and then does the parsing. This can be used in a scenario where some specific element can signal that the page is already complete. 
+ +See the following example about how to create request handler and use context helpers: + + + {AdaptivePlaywrightCrawlerHandler} + + +## Crawler configuration + +To use `AdaptivePlaywrightCrawler` it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser` or `AdaptivePlaywrightCrawler.with_parsel_static_parser`. + +`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. + +In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: + + + + + {AdaptivePlaywrightCrawlerInitBeautifulSoup} + + + + + {AdaptivePlaywrightCrawlerInitParsel} + + + + +### Prediction related arguments + +To control which pages are crawled by which method you can use following arguments: + +`RenderingTypePredictor` - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend to use both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations. + +`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`. + +`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. 
This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler. + +See the following example about how to pass prediction related arguments: + + + {AdaptivePlaywrightCrawlerInitPrediction} + + +## Page configuration with pre-navigation hooks + +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. 
+ +See the following example about how to register the pre navigation hooks: + + + {AdaptivePlaywrightCrawlerPreNavHooks} + diff --git a/website/versioned_docs/version-0.6/guides/proxy_management.mdx b/website/versioned_docs/version-0.6/guides/proxy_management.mdx new file mode 100644 index 0000000000..38385ac950 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/proxy_management.mdx @@ -0,0 +1,120 @@ +--- +id: proxy-management +title: Proxy management +description: Using proxies to get around those annoying IP-blocks +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import QuickStartExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/quick_start_example.py'; +import IntegrationBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_bs_example.py'; +import IntegrationPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_pw_example.py'; +import TiersBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_bs_example.py'; +import TiersPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_pw_example.py'; +import InspectionBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_bs_example.py'; +import InspectionPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_pw_example.py'; + +import SessionBsExample from '!!raw-loader!./code_examples/proxy_management/session_bs_example.py'; +import SessionPwExample from '!!raw-loader!./code_examples/proxy_management/session_pw_example.py'; + +[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. 
It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. The most powerful weapon in our anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server). + +With Crawlee we can use our own proxy servers or proxy servers acquired from third-party providers. + +[//]: # (Check out the [avoid blocking guide](./avoid-blocking) for more information about blocking.) + +## Quick start + +If you already have proxy URLs of your own, you can start using them immediately in only a few lines of code. + + + {QuickStartExample} + + +Examples of how to use our proxy URLs with crawlers are shown below in [Crawler integration](#crawler-integration) section. + +## Proxy configuration + +All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` constructor function based on the provided options. + +### Crawler integration + +`ProxyConfiguration` integrates seamlessly into `BeautifulSoupCrawler` and `PlaywrightCrawler`. + + + + + {IntegrationBsExample} + + + + + {IntegrationPwExample} + + + + +Our crawlers will now use the selected proxies for all connections. + +### IP Rotation and session management + +The `proxy_configuration.new_url()` method allows us to pass a `session_id` parameter. This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the `SessionPool` class for more information on how maintaining a real session helps avoid blocking. + +For more details on session management, check out the [Session management](./session-management) guide. + +When no `session_id` is provided, our proxy URLs are rotated round-robin. 
+ + + + + {SessionBsExample} + + + + + {SessionPwExample} + + + + +### Tiered proxies + +When you use HTTP proxies in real world crawling scenarios, you have to decide which type of proxy to use to reach the sweet spot between cost efficiency and reliably avoiding blocking. Some websites may allow crawling with no proxy, on some you may get away with using datacenter proxies, which are cheap but easily detected, and sometimes you need to use expensive residential proxies. + +To take the guesswork out of this process, Crawlee allows you to configure multiple tiers of proxy URLs. When crawling, it will automatically pick the lowest tier (smallest index) where it doesn't encounter blocking. If you organize your proxy server URLs in tiers so that the lowest tier contains the cheapest, least reliable ones and each higher tier contains more expensive, more reliable ones, you will get an optimal anti-blocking performance. + +In an active tier, Crawlee will alternate between proxies in a round-robin fashion, just like it would with `proxy_urls`. + + + + + {TiersBsExample} + + + + + {TiersPwExample} + + + + +## Inspecting current proxy in crawlers + +The `BeautifulSoupCrawler` and `PlaywrightCrawler` provide access to information about the currently used proxy via the request handler using a `proxy_info` object. This object allows easy access to the proxy URL. + + + + + {InspectionBsExample} + + + + + {InspectionPwExample} + + + diff --git a/website/versioned_docs/version-0.6/guides/request_loaders.mdx b/website/versioned_docs/version-0.6/guides/request_loaders.mdx new file mode 100644 index 0000000000..73fe374a62 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/request_loaders.mdx @@ -0,0 +1,142 @@ +--- +id: request-loaders +title: Request loaders +description: How to manage the requests your crawler will go through. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; +import TandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example.py'; +import ExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example_explicit.py'; + +The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the `RequestQueue`, providing additional tools for managing URLs. If you are new to Crawlee, and you do not know the `RequestQueue`, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases, such as reading URLs from files, external APIs or combining multiple sources together. + +## Overview + +The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package introduces the following abstract classes: + +- `RequestLoader`: The base interface for reading requests in a crawl. +- `RequestManager`: Extends `RequestLoader` with write capabilities. +- `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`. + +And one specific request loader: + +- `RequestList`: A lightweight implementation of request loader for managing a static list of URLs. 
+ +Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`: + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class BaseStorage { + <> + + id + + name + + open() + + drop() +} + +class RequestLoader { + <> + + fetch_next_request() + + mark_request_as_handled() + + is_empty() + + is_finished() + + get_handled_count() + + get_total_count() + + to_tandem() +} + +class RequestManager { + <> + + add_request() + + add_requests_batched() + + reclaim_request() + + drop() +} + +%% ======================== +%% Specific classes +%% ======================== + +class RequestQueue { + _attributes_ + _methods_() +} + +class RequestList { + _attributes_ + _methods_() +} + +class RequestManagerTandem { + _attributes_ + _methods_() +} + +%% ======================== +%% Inheritance arrows +%% ======================== + +BaseStorage <|-- RequestQueue +RequestManager <|-- RequestQueue + +RequestLoader <|-- RequestManager +RequestLoader <|-- RequestList +RequestManager <|-- RequestManagerTandem +``` + +## Request loader + +The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, or checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios. You may create your own loader that reads from an external file, a web endpoint, a database or matches some other specific scenario. For more details refer to the `RequestLoader` API reference. + +The `RequestList` can accept an asynchronous generator as input. This allows the requests to be streamed, rather than loading them all into memory at once. This can significantly reduce the memory usage, especially when working with large sets of URLs. 
+
+Here is a basic example of working with the `RequestList`:
+
+
+ {RlBasicExample}
+
+
+## Request manager
+
+The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add or reclaim them. This is important for dynamic crawling projects, where new URLs may emerge during the crawl process, or when certain requests may fail and need to be retried. For more details refer to the `RequestManager` API reference.
+
+## Request manager tandem
+
+The `RequestManagerTandem` class allows you to combine the read-only capabilities of a `RequestLoader` (like `RequestList`) with the read-write capabilities of a `RequestManager` (like `RequestQueue`). This is useful for scenarios where you need to load initial requests from a static source (like a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times. Under the hood, `RequestManagerTandem` checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side.
+
+### Request list with request queue
+
+This section describes the combination of the `RequestList` and `RequestQueue` classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but you also need to handle dynamic requests during the crawl process. The `RequestManagerTandem` class facilitates this combination, with the `RequestLoader.to_tandem` method available as a convenient shortcut. Requests from the `RequestList` are processed first by enqueuing them into the default `RequestQueue`, which handles persistence and retries failed requests.
+ + + + + {ExplicitTandemExample} + + + + + {TandemExample} + + + + +## Conclusion + +This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` class. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-0.6/guides/running_in_web_server.mdx b/website/versioned_docs/version-0.6/guides/running_in_web_server.mdx new file mode 100644 index 0000000000..63f907e616 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/running_in_web_server.mdx @@ -0,0 +1,47 @@ +--- +id: running-in-web-server +title: Running in web server +description: Running in web server +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; +import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; + + +Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. 
+
+We will build a simple HTTP server that receives a page URL and returns the page title in the response.
+
+## Set up a web server
+
+There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), ... In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple.
+
+This will be our core server setup:
+
+
+ {Server}
+
+
+The server has two endpoints.
+- `/` - The index just gives a short description of the server with an example link to the second endpoint.
+- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL.
+
+To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and from the directory where the example code is located you can use the following command:
+```
+fastapi dev server.py
+```
+
+## Create a crawler
+
+We will create a standard `ParselCrawler` and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in.
+
+
+ {Crawler}
+
+
+The crawler is defined inside of [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan), which is a FastAPI way to run some startup/teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`:
+- `crawler` holds an instance of our crawler and allows the app to interact with it.
+- `requests_to_results` is a dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler.
diff --git a/website/versioned_docs/version-0.6/guides/scaling_crawlers.mdx b/website/versioned_docs/version-0.6/guides/scaling_crawlers.mdx new file mode 100644 index 0000000000..5dce8ac640 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/scaling_crawlers.mdx @@ -0,0 +1,49 @@ +--- +id: scaling-crawlers +title: Scaling crawlers +description: Learn how to scale your crawlers by controlling concurrency and limiting requests per minute. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import MaxTasksPerMinuteExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/max_tasks_per_minute_example.py'; +import MinAndMaxConcurrencyExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/min_and_max_concurrency_example.py'; + +As we build our crawler, we may want to control how many tasks it performs at any given time. In other words, how many requests it makes to the web we are trying to scrape. Crawlee offers several options to fine-tune the number of parallel tasks, limit the number of requests per minute, and optimize scaling based on available system resources. + +:::tip + +All of these options are available across all crawlers provided by Crawlee. In this guide, we are using the `BeautifulSoupCrawler` as an example. You should also explore the `ConcurrencySettings`. + +::: + +## Max tasks per minute + +The `max_tasks_per_minute` setting in `ConcurrencySettings` controls how many total tasks the crawler can process per minute. It ensures that tasks are spread evenly throughout the minute, preventing a sudden burst at the `max_concurrency` limit followed by idle time. By default, this is set to `Infinity`, meaning the crawler can run at full speed, limited only by `max_concurrency`. Use this option if you want to throttle your crawler to avoid overwhelming the target website with continuous requests. 
+ + + {MaxTasksPerMinuteExample} + + +## Minimum and maximum concurrency + +The `min_concurrency` and `max_concurrency` options in the `ConcurrencySettings` define the minimum and maximum number of parallel tasks that can run at any given time. By default, crawlers start with a single parallel task and gradually scale up to a maximum of concurrent requests. + +:::caution Avoid setting minimum concurrency too high + +If you set `min_concurrency` too high compared to the available system resources, the crawler may run very slowly or even crash. It is recommended to stick with the default value and let the crawler automatically adjust concurrency based on the system's available resources. + +::: + +## Desired concurrency + +The `desired_concurrency` option in the `ConcurrencySettings` specifies the initial number of parallel tasks to start with, assuming sufficient resources are available. It defaults to the same value as `min_concurrency`. + + + {MinAndMaxConcurrencyExample} + + +## Autoscaled pool + +The `AutoscaledPool` manages a pool of asynchronous, resource-intensive tasks that run in parallel. It automatically starts new tasks only when there is enough free CPU and memory. To monitor system resources, it leverages the `Snapshotter` and `SystemStatus` classes. If any task raises an exception, the error is propagated, and the pool is stopped. Every crawler uses an `AutoscaledPool` under the hood. diff --git a/website/versioned_docs/version-0.6/guides/session_management.mdx b/website/versioned_docs/version-0.6/guides/session_management.mdx new file mode 100644 index 0000000000..a3a1385db1 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/session_management.mdx @@ -0,0 +1,94 @@ +--- +id: session-management +title: Session management +description: How to manage your cookies, proxy IP rotations and more. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BasicSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_basic.py'; +import HttpSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_http.py'; +import BeautifulSoupSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_beautifulsoup.py'; +import ParselSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_parsel.py'; +import PlaywrightSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_playwright.py'; +import StandaloneSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_standalone.py'; +import OneSession from '!!raw-loader!roa-loader!./code_examples/session_management/one_session_http.py'; +import MultiSessions from '!!raw-loader!roa-loader!./code_examples/session_management/multi_sessions_http.py'; + +The `SessionPool` class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies. + +Additionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address. + +Finally, it ensures even IP address rotation by randomly selecting sessions. This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper. + +For more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide. 
+ +Now, let's explore examples of how to use the `SessionPool` in different scenarios: +- with `BasicCrawler`; +- with `HttpCrawler`; +- with `BeautifulSoupCrawler`; +- with `ParselCrawler`; +- with `PlaywrightCrawler`; +- without a crawler (standalone usage to manage sessions manually). + + + + + {BasicSource} + + + + + {HttpSource} + + + + + {BeautifulSoupSource} + + + + + {ParselSource} + + + + + {PlaywrightSource} + + + + + {StandaloneSource} + + + + +These examples demonstrate the basics of configuring and using the `SessionPool`. + +Please, bear in mind that `SessionPool` requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected and will improve over time. + +## Configuring a single session + +In some cases, you need full control over session usage. For example, when working with websites requiring authentication or initialization of certain parameters like cookies. + +When working with a site that requires authentication, we typically don't want multiple sessions with different browser fingerprints or client parameters accessing the site. In this case, we need to configure the `SessionPool` appropriately: + + + {OneSession} + + +## Binding requests to specific sessions + +In the previous example, there's one obvious limitation - you're restricted to only one session. + +In some cases, we need to achieve the same behavior but using multiple sessions in parallel, such as authenticating with different profiles or using different proxies. 
+ +To do this, use the `session_id` parameter for the `Request` object to bind a request to a specific session: + + + {MultiSessions} + diff --git a/website/versioned_docs/version-0.6/guides/storages.mdx b/website/versioned_docs/version-0.6/guides/storages.mdx new file mode 100644 index 0000000000..3be168b683 --- /dev/null +++ b/website/versioned_docs/version-0.6/guides/storages.mdx @@ -0,0 +1,221 @@ +--- +id: storages +title: Storages +description: How to work with storages in Crawlee, how to manage requests and how to store and retrieve scraping results. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RqBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_basic_example.py'; +import RqWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_example.py'; +import RqWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_explicit_example.py'; +import RqHelperAddRequestsExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_add_requests_example.py'; +import RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_enqueue_links_example.py'; + +import DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py'; +import DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py'; +import DatasetWithCrawerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py'; + +import KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py'; +import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py'; +import KvsWithCrawlerExplicitExample from 
'!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py'; + +import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py'; +import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py'; + +Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs. + +## Storage clients + +Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: + +- `MemoryStorageClient`: Stores data in memory and persists it to the local file system. +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). + +Each storage client is responsible for maintaining the storages in a specific environment. This abstraction makes it easier to switch between different environments, e.g. between local development and cloud production setup. + +### Memory storage client + +The `MemoryStorageClient` is the default and currently the only one storage client in Crawlee. It stores data in memory and persists it to the local file system. The data are stored in the following directory structure: + +```text +{CRAWLEE_STORAGE_DIR}/{storage_type}/{STORAGE_ID}/ +``` + +where: + +- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage, specified by the `CRAWLEE_STORAGE_DIR` environment variable (default: `./storage`). 
+- `{storage_type}`: The type of storage (e.g., `datasets`, `key_value_stores`, `request_queues`).
+- `{STORAGE_ID}`: The ID of the specific storage instance (default: `default`).
+
+:::info NOTE
+The current `MemoryStorageClient` and its interface are quite old and not great. We plan to refactor it, together with the whole `StorageClient` interface, in the near future to make it better and easier to use. We also plan to introduce new storage clients for different storage backends - e.g. for [SQLite](https://sqlite.org/).
+:::
+
+You can override default storage IDs using these environment variables: `CRAWLEE_DEFAULT_DATASET_ID`, `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, or `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID`.
+
+## Request queue
+
+The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The `RequestQueue` is highly useful for large-scale and complex crawls.
+
+By default, data are stored using the following path structure:
+
+```text
+{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{INDEX}.json
+```
+
+- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data, specified by the environment variable.
+- `{QUEUE_ID}`: The ID of the request queue, "default" by default.
+- `{INDEX}`: Represents the zero-based index of the record within the queue.
+ +The following code demonstrates the usage of the `RequestQueue`: + + + + + {RqBasicExample} + + + + + {RqWithCrawlerExample} + + + + + {RqWithCrawlerExplicitExample} + + + + +### Request-related helpers + +Crawlee provides helper functions to simplify interactions with the `RequestQueue`: + +- The `add_requests` function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the `Request` class to the helper. +- The `enqueue_links` function is designed to discover new URLs in the current page and add them to the request storage. It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details. + + + + + {RqHelperAddRequestsExample} + + + + + {RqHelperEnqueueLinksExample} + + + + +### Request manager + +The `RequestQueue` implements the `RequestManager` interface, offering a unified API for interacting with various request storage types. This provides a unified way to interact with different request storage types. + +If you need custom functionality, you can create your own request storage by subclassing the `RequestManager` class and implementing its required methods. + +For a detailed explanation of the `RequestManager` and other related components, refer to the [Request loaders guide](https://crawlee.dev/python/docs/guides/request-loaders). + +## Dataset + +The `Dataset` is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. 
Think of a `Dataset` as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional. + +By default, data are stored using the following path structure: + +```text +{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json +``` +- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. +- `{DATASET_ID}`: The dataset's ID, "default" by default. +- `{INDEX}`: Represents the zero-based index of the record within the dataset. + +The following code demonstrates basic operations of the dataset: + + + + + {DatasetBasicExample} + + + + + {DatasetWithCrawlerExample} + + + + + {DatasetWithCrawerExplicitExample} + + + + +### Dataset-related helpers + +Crawlee provides the following helper function to simplify interactions with the `Dataset`: + +- The `push_data` function allows you to manually add data to the dataset. You can optionally specify the dataset ID or its name. + +## Key-value store + +The `KeyValueStore` is designed to save and retrieve data records or files efficiently. Each record is uniquely identified by a key and is associated with a specific MIME type, making the `KeyValueStore` ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers. + +By default, data are stored using the following path structure: + +```text +{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} +``` +- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. +- `{STORE_ID}`: The KVS's ID, "default" by default. +- `{KEY}`: The unique key for the record. +- `{EXT}`: The file extension corresponding to the MIME type of the content. 
+ +The following code demonstrates the usage of the `KeyValueStore`: + + + + + {KvsBasicExample} + + + + + {KvsWithCrawlerExample} + + + + + {KvsWithCrawlerExplicitExample} + + + + +To see a real-world example of how to get the input from the key-value store, see the [Screenshots](https://crawlee.dev/python/docs/examples/capture-screenshots-using-playwright) example. + +### Key-value store-related helpers + +Crawlee provides the following helper function to simplify interactions with the `KeyValueStore`: + +- The `get_key_value_store` function retrieves the key-value store for the current crawler run. If the KVS does not exist, it will be created. You can also specify the KVS's ID or its name. + +## Cleaning up the storages + +Default storages are purged before the crawler starts, unless explicitly configured otherwise. For that case, see `Configuration.purge_on_start`. This cleanup happens as soon as a storage is accessed, either when you open a storage (e.g. using `RequestQueue.open`, `Dataset.open`, `KeyValueStore.open`) or when interacting with a storage through one of the helper functions (e.g. `push_data`), which implicitly opens the result storage. + + + {CleaningDoNotPurgeExample} + + +If you do not explicitly interact with storages in your code, the purging will occur automatically when the `BasicCrawler.run` method is invoked. + +If you need to purge storages earlier, you can call `MemoryStorageClient.purge_on_start` directly if you are using the default storage client. This method triggers the purging process for the underlying storage implementation you are currently using. + + + {CleaningPurgeExplicitlyExample} + + +## Conclusion + +This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. 
You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-0.6/introduction/01_setting_up.mdx b/website/versioned_docs/version-0.6/introduction/01_setting_up.mdx new file mode 100644 index 0000000000..cc67f33c1f --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/01_setting_up.mdx @@ -0,0 +1,153 @@ +--- +id: setting-up +title: Setting up +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide will help you get started with Crawlee by setting it up on your computer. Follow the steps below to ensure a smooth installation process. + +## Prerequisites + +Before installing Crawlee itself, make sure that your system meets the following requirements: + +- **Python 3.9 or higher**: Crawlee requires Python 3.9 or a newer version. You can download Python from the [official website](https://python.org/downloads/). +- **Python package manager**: While this guide uses [pip](https://pip.pypa.io/) (the most common package manager), you can also use any package manager you want. You can download pip from the [official website](https://pip.pypa.io/en/stable/installation/). + +### Verifying prerequisites + +To check if Python and pip are installed, run the following commands: + +```sh +python --version +``` + +```sh +python -m pip --version +``` + +If these commands return the respective versions, you're ready to continue. + +## Installing Crawlee + +Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. 
This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. + +### Basic installation + +To install the core package, run: + +```sh +python -m pip install crawlee +``` + +After installation, verify that Crawlee is installed correctly by checking its version: + +```sh +python -c 'import crawlee; print(crawlee.__version__)' +``` + +### Full installation + +If you do not mind the package size, you can run the following command to install Crawlee with all optional features: + +```sh +python -m pip install 'crawlee[all]' +``` + +### Installing specific extras + +Depending on your use case, you may want to install specific extras to enable additional functionality: + +For using the `BeautifulSoupCrawler`, install the `beautifulsoup` extra: + +```sh +python -m pip install 'crawlee[beautifulsoup]' +``` + +For using the `ParselCrawler`, install the `parsel` extra: + +```sh +python -m pip install 'crawlee[parsel]' +``` + +For using the `CurlImpersonateHttpClient`, install the `curl-impersonate` extra: + +```sh +python -m pip install 'crawlee[curl-impersonate]' +``` + +If you plan to use a (headless) browser with `PlaywrightCrawler`, install Crawlee with the `playwright` extra: + +```sh +python -m pip install 'crawlee[playwright]' +``` + +After installing the playwright extra, install the necessary Playwright dependencies: + +```sh +playwright install +``` + +### Installing multiple extras + +You can install multiple extras at once by using a comma as a separator: + +```sh +python -m pip install 'crawlee[beautifulsoup,curl-impersonate]' +``` + +## Start a new project + +The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. The CLI helps you set up a new project in seconds. + +### Using Crawlee CLI with Pipx + +First, ensure you have Pipx installed. 
You can check if Pipx is installed by running: + +```sh +pipx --version +``` + +If Pipx is not installed, follow the official [installation guide](https://pipx.pypa.io/stable/installation/). + +Then, run the Crawlee CLI using Pipx and choose from the available templates: + +```sh +pipx run 'crawlee[cli]' create my-crawler +``` + +### Using Crawlee CLI directly + +If you already have `crawlee` installed, you can spin it up by running: + +```sh +crawlee create my_crawler +``` + +Follow the interactive prompts in the CLI to choose a crawler type and set up your new project. + +### Running your project + +To run your newly created project, navigate to the project directory, activate the virtual environment, and execute the Python interpreter with the project module: + + + + cd my_crawler/ + source .venv/bin/activate + python -m my_crawler + + + cd my_crawler/ + venv\Scripts\activate + python -m my_crawler + + + +Congratulations! You have successfully set up and executed your first Crawlee project. + +## Next steps + +Next, you will learn how to create a very simple crawler and Crawlee components while building it. diff --git a/website/versioned_docs/version-0.6/introduction/02_first_crawler.mdx b/website/versioned_docs/version-0.6/introduction/02_first_crawler.mdx new file mode 100644 index 0000000000..203ab92146 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/02_first_crawler.mdx @@ -0,0 +1,95 @@ +--- +id: first-crawler +title: First crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RequestQueueExample from '!!raw-loader!roa-loader!./code_examples/02_request_queue.py'; +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/02_bs.py'; +import BeautifulSoupBetterExample from '!!raw-loader!roa-loader!./code_examples/02_bs_better.py'; + +Now, you will build your first crawler. 
But before you do, let's briefly introduce the Crawlee classes involved in the process. + +## How Crawlee works + +There are 3 main crawler classes available for use in Crawlee. + +- `BeautifulSoupCrawler` +- `ParselCrawler` +- `PlaywrightCrawler` + +We'll talk about their differences later. Now, let's talk about what they have in common. + +The general idea of each crawler is to go to a web page, open it, do some stuff there, save some results, continue to the next page, and repeat this process until the crawler's done its job. So the crawler always needs to find answers to two questions: _Where should I go?_ and _What should I do there?_ Answering those two questions is the only required setup. The crawlers have reasonable defaults for everything else. + +### The where - `Request` and `RequestQueue` + +All crawlers use instances of the `Request` class to determine where they need to go. Each request may hold a lot of information, but at the very least, it must hold a URL - a web page to open. But having only one URL would not make sense for crawling. Sometimes you have a pre-existing list of your own URLs that you wish to visit, perhaps a thousand. Other times you need to build this list dynamically as you crawl, adding more and more URLs to the list as you progress. Most of the time, you will use both options. + +The requests are stored in a `RequestQueue`, a dynamic queue of `Request` instances. You can seed it with start URLs and also add more requests while the crawler is running. This allows the crawler to open one page, extract interesting data, such as links to other pages on the same domain, add them to the queue (called _enqueuing_) and repeat this process to build a queue of virtually unlimited number of URLs. + +### The what - request handler + +In the request handler you tell the crawler what to do at each and every page it visits. 
You can use it to handle extraction of data from the page, processing the data, saving it, calling APIs, doing calculations and so on. + +The request handler is a user-defined function, invoked automatically by the crawler for each `Request` from the `RequestQueue`. It always receives a single argument - `BasicCrawlingContext` (or its descendants). Its properties change depending on the crawler class used, but it always includes the `request` property, which represents the currently crawled URL and related metadata. + +## Building a crawler + +Let's put the theory into practice and start with something easy. Visit a page and get its HTML title. In this tutorial, you'll scrape the Crawlee website [https://crawlee.dev](https://crawlee.dev), but the same code will work for any website. + +### Adding requests to the crawling queue + +Earlier you learned that the crawler uses a queue of requests as its source of URLs to crawl. Let's create it and add the first request. + + + {RequestQueueExample} + + +The `RequestQueue.add_request` method automatically converts the object with URL string to a `Request` instance. So now you have a `RequestQueue` that holds one request which points to `https://crawlee.dev`. + +:::tip Bulk add requests + +The code above is for illustration of the request queue concept. Soon you'll learn about the `BasicCrawler.add_requests` method which allows you to skip this initialization code, and it also supports adding a large number of requests without blocking. + +::: + +### Building a BeautifulSoupCrawler + +Crawlee comes with three main crawler classes: `BeautifulSoupCrawler`, `ParselCrawler`, and `PlaywrightCrawler`. You can read their short descriptions in the [Quick start](../quick-start) lesson. + +Unless you have a good reason to start with a different one, you should try building a `BeautifulSoupCrawler` first. 
It is an HTTP crawler with HTTP2 support, anti-blocking features and integrated HTML parser - [BeautifulSoup](https://pypi.org/project/beautifulsoup4/). It's fast, simple, cheap to run and does not require complicated dependencies. The only downside is that it won't work out of the box for websites which require JavaScript rendering. But you might not need JavaScript rendering at all, because many modern websites use server-side rendering. + +Let's continue with the earlier `RequestQueue` example. + +<RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} +</RunnableCodeBlock> + +When you run the example, you will see the title of https://crawlee.dev printed to the log. What really happens is that `BeautifulSoupCrawler` first makes an HTTP request to `https://crawlee.dev`, then parses the received HTML with BeautifulSoup and makes it available as the `context` argument of the request handler. + +```log +[__main__] INFO The title of "https://crawlee.dev" is "Crawlee · Build reliable crawlers. Fast. | Crawlee". +``` + +### Add requests faster + +Earlier we mentioned that you'll learn how to use the `BasicCrawler.add_requests` method to skip the request queue initialization. It's simple. Every crawler has an implicit `RequestQueue` instance, and you can add requests to it with the `BasicCrawler.add_requests` method. In fact, you can go even further and just use the first parameter of `crawler.run()`! + +<RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupBetterExample} +</RunnableCodeBlock> + +When you run this code, you'll see exactly the same output as with the earlier, longer example. The `RequestQueue` is still there, it's just managed by the crawler automatically. + +:::info + +This method not only makes the code shorter, it will help with performance too! Internally it calls `RequestQueue.add_requests_batched` method. It will wait only for the initial batch of 1000 requests to be added to the queue before resolving, which means the processing will start almost instantly. 
After that, it will continue adding the rest of the requests in the background (again, in batches of 1000 items, once every second). + +::: + +## Next steps + +Next, you'll learn about crawling links. That means finding new URLs on the pages you crawl and adding them to the `RequestQueue` for the crawler to visit. diff --git a/website/versioned_docs/version-0.6/introduction/03_adding_more_urls.mdx b/website/versioned_docs/version-0.6/introduction/03_adding_more_urls.mdx new file mode 100644 index 0000000000..a9669fb8a3 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/03_adding_more_urls.mdx @@ -0,0 +1,120 @@ +--- +id: adding-more-urls +title: Adding more URLs +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import OriginalCodeExample from '!!raw-loader!roa-loader!./code_examples/03_original_code.py'; +import FindingNewLinksExample from '!!raw-loader!roa-loader!./code_examples/03_finding_new_links.py'; +import EnqueueStrategyExample from '!!raw-loader!roa-loader!./code_examples/03_enqueue_strategy.py'; +import GlobsExample from '!!raw-loader!roa-loader!./code_examples/03_globs.py'; +import TransformExample from '!!raw-loader!roa-loader!./code_examples/03_transform_request.py'; + +Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code: + + + {OriginalCodeExample} + + +Now you'll use the example from the previous section and improve on it. You'll add more URLs to the queue and thanks to that the crawler will keep going, finding new links, enqueuing them into the `RequestQueue` and then scraping them. + +## How crawling works + +The process is simple: + +1. Find new links on the page. +2. Filter only those pointing to the same domain, in this case [crawlee.dev](https://crawlee.dev/). +3. Enqueue (add) them to the `RequestQueue`. +4. 
Visit the newly enqueued links. +5. Repeat the process. + +In the following paragraphs you will learn about the `enqueue_links` function which simplifies crawling to a single function call. + +:::tip context awareness + +The `enqueue_links` function is context aware. It means that it will read the information about the currently crawled page from the context, and you don't need to explicitly provide any arguments. However, you can specify filtering criteria or an enqueuing strategy if desired. It will find the links and automatically add the links to the running crawler's `RequestQueue`. + +::: + +## Limit your crawls + +When you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called `max_requests_per_crawl`. It is available in all crawlers, and you can set it like this: + +```python +crawler = BeautifulSoupCrawler(max_requests_per_crawl=20) +``` + +This means that no new requests will be started after the 20th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases. + +## Finding new links + +There are numerous approaches to finding links to follow when crawling the web. For our purposes, we will be looking for `<a>` elements that contain the `href` attribute because that's what you need in most cases. For example: + +```html +<a href="https://crawlee.dev/docs/introduction">This is a link to Crawlee introduction</a> +``` + +Since this is the most common case, it is also the `enqueue_links` default. + +<RunnableCodeBlock className="language-python" language="python"> + {FindingNewLinksExample} +</RunnableCodeBlock> + +If you need to override the default selection of elements in `enqueue_links`, you can use the `selector` argument. + +```python +await context.enqueue_links(selector='a.article-link') +``` + +## Filtering links to same domain + +Websites typically contain a lot of links that lead away from the original page. 
This is normal, but when crawling a website, we usually want to crawl that one site and not let our crawler wander away to Google, Facebook and Twitter. Therefore, we need to filter out the off-domain links and only keep the ones that lead to the same domain. + +```python +# The default behavior of enqueue_links is to stay on the same hostname, so it does not require +# any parameters. This will ensure the subdomain stays the same. +await context.enqueue_links() +``` + +The default behavior of `enqueue_links` is to stay on the same hostname. This **does not include subdomains**. To include subdomains in your crawl, use the `strategy` argument. The `strategy` argument is an instance of the `EnqueueStrategy` type alias. + + + {EnqueueStrategyExample} + + +When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on. + +## Skipping duplicate URLs + +Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the `RequestQueue` which deduplicates requests using their `unique_key`. This `unique_key` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs. + +## Advanced filtering arguments + +While the defaults for `enqueue_links` can be often exactly what you need, it also gives you fine-grained control over which URLs should be enqueued. One way we already mentioned above. It is using the `EnqueueStrategy` type alias. You can use the `all` strategy if you want to follow every single link, regardless of its domain, or you can enqueue links that target the same domain name with the `same-domain` strategy. + +```python +# Wanders the internet. 
+await context.enqueue_links(strategy='all') +``` + +### Filter URLs with patterns + +For even more control, you can use the `include` or `exclude` parameters, either as glob patterns or regular expressions, to filter the URLs. Refer to the API documentation for `enqueue_links` for detailed information on these and other available options. + +<RunnableCodeBlock className="language-python" language="python"> + {GlobsExample} +</RunnableCodeBlock> + +### Transform requests before enqueuing + +For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function takes a `Request` object as input and should return either a modified `Request` object or `None`. If the function returns `None`, the request will be skipped. + +<RunnableCodeBlock className="language-python" language="python"> + {TransformExample} +</RunnableCodeBlock> + +## Next steps + +Next, you will start your project of scraping a production website and learn some more Crawlee tricks in the process. diff --git a/website/versioned_docs/version-0.6/introduction/04_real_world_project.mdx b/website/versioned_docs/version-0.6/introduction/04_real_world_project.mdx new file mode 100644 index 0000000000..61f6435980 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/04_real_world_project.mdx @@ -0,0 +1,159 @@ +--- +id: real-world-project +title: Real-world project +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import SanityCheckExample from '!!raw-loader!roa-loader!./code_examples/04_sanity_check.py'; + +> _Hey, guys, you know, it's cool that we can scrape the `<title>` elements of web pages, but that's not very useful. Can we finally scrape some real data and save it somewhere in a machine-readable format? Because that's why I started reading this tutorial in the first place!_ + +We hear you, young padawan! First, learn how to crawl, you must. Only then, walk through data, you can! 
+ +## Making a production-grade crawler + +Making a production-grade crawler is not difficult, but there are many pitfalls of scraping that can catch you off guard. So for the real-world project you'll learn how to scrape a [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) instead of the Crawlee website. It contains a list of products of different categories, and each product has its own detail page. + +The website requires JavaScript rendering, which allows us to showcase more features of Crawlee. We've also added some helpful tips that prepare you for the real-world issues that you will surely encounter when scraping at scale. + +:::tip Not interested in theory? + +If you're not interested in crawling theory, feel free to [skip to the next chapter](./crawling) and get right back to coding. + +::: + +## Drawing a plan + +Sometimes scraping is really straightforward, but most of the time, it really pays off to do a bit of research first and try to answer some of these questions: + +- How is the website structured? +- Can I scrape it only with HTTP requests (read "with some <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, e.g. <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>")? +- Do I need a headless browser for something? +- Are there any anti-scraping protections in place? +- Do I need to parse the HTML or can I get the data otherwise, such as directly from the website's API? + +For the purposes of this tutorial, let's assume that the website cannot be scraped with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>. It actually can, but we would have to dive a bit deeper than this introductory guide allows. So for now we will make things easier for you, scrape it with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, and you'll learn about headless browsers in the process. 
+ +## Choosing the data you need + +A good first step is to figure out what data you want to scrape and where to find it. For the time being, let's just agree that we want to scrape all products from all categories available on the [all collections page of the store](https://warehouse-theme-metal.myshopify.com/collections) and for each product we want to get its: + +- URL +- Manufacturer +- SKU +- Title +- Current price +- Stock available + +You will notice that some information is available directly on the list page, but for details such as "SKU" we'll also need to open the product's detail page. + +![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') + +### The start URL(s) + +This is where you start your crawl. It's convenient to start as close to the data as possible. For example, it wouldn't make much sense to start at https://warehouse-theme-metal.myshopify.com and look for a `collections` link there, when we already know that everything we want to extract can be found at the https://warehouse-theme-metal.myshopify.com/collections page. + +## Exploring the page + +Let's take a look at the https://warehouse-theme-metal.myshopify.com/collections page more carefully. There are some **categories** on the page, and each category has a list of **items**. On some category pages, at the bottom you will notice there are links to the next pages of results. This is usually called **the pagination**. + +### Categories and sorting + +When you click the categories, you'll see that they load a page of products filtered by that category. By going through a few categories and observing the behavior, we can also observe that we can sort by different conditions (such as `Best selling`, or `Price, low to high`), but for this example, we will not be looking into those. 
+ +:::caution Limited pagination + +Be careful, because on some websites, like [amazon.com](https://amazon.com), this is not true and the sum of products in categories is actually larger than what's available without filters. Learn more in our [tutorial on scraping websites with limited pagination](https://docs.apify.com/tutorials/scrape-paginated-sites). + +::: + +### Pagination + +The pagination of the demo Warehouse Store is simple enough. When switching between pages, you will see that the URL changes to: + +```text +https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2 +``` + +Try clicking on the link to page 4. You'll see that the pagination links update and show more pages. But can you trust that this will include all pages and won't stop at some point? + +:::caution Test your assumptions + +Similarly to the issue with filters explained above, the existence of pagination does not guarantee that you can simply paginate through all the results. Always test your assumptions about pagination. Otherwise, you might miss a chunk of results, and not even know about it. + +::: + +At the time of writing the `Headphones` collection results counter showed 75 results - products. Quick count of products on one page of results makes 24. 6 rows times 4 products. This means that there are 4 pages of results. + +If you're not convinced, you can visit a page somewhere in the middle, like `https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2` and see how the pagination looks there. + +## The crawling strategy + +Now that you know where to start and how to find all the collection details, let's look at the crawling process. + +1. Visit the store page containing the list of categories (our start URL). +2. Enqueue all links to all categories. +3. Enqueue all product pages from the current page. +4. Enqueue links to next pages of results. +5. Open the next page in queue. + - When it's a results list page, go to 2. 
+ - When it's a product page, scrape the data. +6. Repeat until all results pages and all products have been processed. + +`PlaywrightCrawler` will make sure to visit the pages for you, if you provide the correct requests, and you already know how to enqueue pages, so this should be fairly easy. Nevertheless, there are a few more tricks that we'd like to showcase. + +## Sanity check + +Let's check that everything is set up correctly before writing the scraping logic itself. You might realize that something in your previous analysis doesn't quite add up, or the website might not behave exactly as you expected. + +The example below creates a new crawler that visits the start URL and prints the text content of all the categories on that page. When you run the code, you will see the _very badly formatted_ content of the individual category card. + +<RunnableCodeBlock className="language-python" language="python"> + {SanityCheckExample} +</RunnableCodeBlock> + +If you're wondering how to get that `.collection-block-item` selector, we'll explain it in the next chapter on DevTools. + +## DevTools - the scraper's toolbox + +:::info DevTool choice + +We'll use Chrome DevTools here, since it's the most common browser, but feel free to use any other, they're all very similar. + +::: + +Let's open DevTools by going to https://warehouse-theme-metal.myshopify.com/collections in Chrome and then right-clicking anywhere in the page and selecting **Inspect**, or by pressing **F12** or whatever your system prefers. With DevTools, you can inspect or manipulate any aspect of the currently open web page. You can learn more about DevTools in their [official documentation](https://developer.chrome.com/docs/devtools/). + +## Selecting elements + +In the DevTools, choose the **Select an element** tool and try hovering over one of the collection cards. 
+ +![select an element](/img/getting-started/select-an-element.jpg 'Finding the select an element tool.') + +You'll see that you can select different elements inside the card. Instead, select the whole card, not just some of its contents, such as its title or description. + +![selected element](/img/getting-started/selected-element.jpg 'Selecting an element by hovering over it.') + +Selecting an element will highlight it in the DevTools HTML inspector. When you carefully look at the elements, you'll see that there are some **classes** attached to the different HTML elements. Those are called **CSS classes**, and we can make use of them in scraping. + +Conversely, by hovering over elements in the HTML inspector, you will see them highlight on the page. Inspect the page's structure around the collection card. You'll see that all the card's data is displayed in an `<a>` element with a `class` attribute that includes **collection-block-item**. It should now make sense how we got that `.collection-block-item` selector. It's just a way to find all elements that are annotated with the `collection-block-item` class. + +It's always a good idea to double-check that you're not getting any unwanted elements with this class. To do that, go into the **Console** tab of DevTools and run: + +```ts +document.querySelectorAll('.collection-block-item'); +``` + +You will see that only the 31 collection cards will be returned, and nothing else. + +:::tip Learn more about CSS selectors and DevTools + +CSS selectors and DevTools are quite a big topic. If you want to learn more, visit the [Web scraping for beginners course](https://developers.apify.com/academy/web-scraping-for-beginners) in the Apify Academy. **It's free and open-source** ❤️. + +::: + +## Next steps + +Next, you will crawl the whole store, including all the listing pages and all the product detail pages. 
diff --git a/website/versioned_docs/version-0.6/introduction/05_crawling.mdx b/website/versioned_docs/version-0.6/introduction/05_crawling.mdx new file mode 100644 index 0000000000..7c68662766 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/05_crawling.mdx @@ -0,0 +1,50 @@ +--- +id: crawling +title: Crawling +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import CrawlingListingExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_listing.py'; +import CrawlingDetailExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_detail.py'; + +To crawl the whole [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) and find all the data, you first need to visit all the pages with products - going through all categories available and also all the product detail pages. + +## Crawling the listing pages + +In previous lessons, you used the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function like this: + +```python +await enqueue_links() +``` + +While useful in that scenario, you need something different now. Instead of finding all the `<a href="..">` elements with links to the same hostname, you need to find only the specific ones that will take your crawler to the next page of results. Otherwise, the crawler will visit a lot of other pages that you're not interested in. Using the power of DevTools and yet another <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> parameter, this becomes fairly easy. + +<RunnableCodeBlock className="language-python" language="python"> + {CrawlingListingExample} +</RunnableCodeBlock> + +The code should look pretty familiar to you. It's a very simple request handler where we log the currently processed URL to the console and enqueue more links. But there are also a few new, interesting additions. Let's break it down. 
+ +### The `selector` parameter of `enqueue_links` + +When you previously used <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, you were not providing any `selector` parameter, and it was fine, because you wanted to use the default value, which is `a` - finds all `<a>` elements. But now, you need to be more specific. There are multiple `<a>` links on the `Categories` page, and you're only interested in those that will take your crawler to the available list of results. Using the DevTools, you'll find that you can select the links you need using the `.collection-block-item` selector, which selects all the elements that have the `class=collection-block-item` attribute. + +### The `label` of `enqueue_links` + +You will see `label` used often throughout Crawlee, as it's a convenient way of labelling a <ApiLink to="class/Request">`Request`</ApiLink> instance for quick identification later. You can access it with `request.label` and it's a `string`. You can name your requests any way you want. Here, we used the label `CATEGORY` to note that we're enqueueing pages that represent a category of products. The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function will add this label to all requests before enqueueing them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Why this is useful will become obvious in a minute. + +## Crawling the detail pages + +In a similar fashion, you need to collect all the URLs to the product detail pages, because only from there you can scrape all the data you need. The following code only repeats the concepts you already know for another set of links. + +<RunnableCodeBlock className="language-python" language="python"> + {CrawlingDetailExample} +</RunnableCodeBlock> + +The crawling code is now complete. When you run the code, you'll see the crawler visit all the listing URLs and all the detail URLs. 
+ +## Next steps + +This concludes the Crawling lesson, because you have taught the crawler to visit all the pages it needs. Let's continue with scraping data. diff --git a/website/versioned_docs/version-0.6/introduction/06_scraping.mdx b/website/versioned_docs/version-0.6/introduction/06_scraping.mdx new file mode 100644 index 0000000000..51c86e5835 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/06_scraping.mdx @@ -0,0 +1,155 @@ +--- +id: scraping +title: Scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ScrapingExample from '!!raw-loader!roa-loader!./code_examples/06_scraping.py'; + +In the [Real-world project](./real-world-project#choosing-the-data-you-need) chapter, you've created a list of the information you wanted to collect about the products in the example Warehouse store. Let's review that and figure out ways to access the data. + +- URL +- Manufacturer +- SKU +- Title +- Current price +- Stock available + +![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') + +## Scraping the URL and manufacturer + +Some information is lying right there in front of us without even having to touch the product detail pages. The `URL` we already have - the `context.request.url`. And by looking at it carefully, we realize that we can also extract the manufacturer from the URL (as all product urls start with `/products/<manufacturer>`). We can just split the `string` and be on our way then! + +:::info url vs loaded url + +You can use `request.loaded_url` as well. Remember the difference: `request.url` is what you enqueue, `request.loaded_url` is what gets processed (after possible redirects). + +::: + +By splitting the `request.url`, we can extract the manufacturer name directly from the URL. 
This is done by first splitting the URL to get the product identifier and then splitting that identifier to get the manufacturer name. + +```python +# context.request.url: +# https://warehouse-theme-metal.myshopify.com/products/sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 + +# Split the URL and get the last part. +url_part = context.request.url.split('/').pop() +# url_part: sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 + +# Split the last part by '-' and get the first element. +manufacturer = url_part.split('-')[0] +# manufacturer: 'sennheiser' +``` + +:::tip Storing information + +It's a matter of preference, whether to store this information separately in the resulting dataset, or not. Whoever uses the dataset can easily parse the `manufacturer` from the `URL`, so should you duplicate the data unnecessarily? Our opinion is that unless the increased data consumption would be too large to bear, it's better to make the dataset as rich as possible. For example, someone might want to filter by `manufacturer`. + +::: + +:::caution Adapt and extract + +One thing you may notice is that the `manufacturer` might have a `-` in its name. If that's the case, your best bet is extracting it from the details page instead, but it's not mandatory. At the end of the day, you should always adjust and pick the best solution for your use case, and website you are crawling. + +::: + +Now it's time to add more data to the results. Let's open one of the product detail pages, for example the [Sony XBR-950G](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv) page and use our DevTools-Fu ๐Ÿฅ‹ to figure out how to get the title of the product. + +## Scraping title + +To scrape the product title from a webpage, you need to identify its location in the HTML structure. 
By using the element selector tool in your browser's DevTools, you can see that the title is within an `<h1>` tag, which is a common practice for important headers. This `<h1>` tag is enclosed in a `<div>` with the class `product-meta`. We can leverage this structure to create a combined selector `.product-meta h1`. This selector targets any `<h1>` element that is a child of an element with the class `product-meta`. + +![product title](/img/getting-started/title.jpg 'Finding product title in DevTools.') + +:::tip Verifying selectors with DevTools + +Remember that you can press CTRL+F (or CMD+F on Mac) in the **Elements** tab of DevTools to open the search bar where you can quickly search for elements using their selectors. Always verify your scraping process and assumptions using the DevTools. It's faster than changing the crawler code all the time. + +::: + +To get the title, you need to locate it using Playwright with the `.product-meta h1` selector. This selector specifically targets the `<h1>` element you need. If multiple elements match, it will throw an error, which is beneficial as it prevents returning incorrect data silently. Ensuring the accuracy of your selectors is crucial for reliable data extraction. + +```python +title = await context.page.locator('.product-meta h1').text_content() +``` + +## Scraping SKU + +Using the DevTools, you can find that the product SKU is inside a `<span>` tag with the class `product-meta__sku-number`. Since there is no other `<span>` with that class on the page, you can safely use this selector to extract the SKU. + +![product sku selector](/img/getting-started/sku.jpg 'Finding product SKU in DevTools.') + +```python +# Find the SKU element using the selector and get its text content. +sku = await context.page.locator('span.product-meta__sku-number').text_content() +``` + +## Scraping current price + +Using DevTools, you can find that the current price is within a `<span>` element tagged with the `price` class. 
However, it is nested alongside another `<span>` element with the `visually-hidden` class. To avoid extracting the wrong text, you can filter the elements to get the correct one using the `has_text` helper. + +![product current price selector](/img/getting-started/current-price.jpg 'Finding product current price in DevTools.') + +```python +# Locate the price element and filter out the visually hidden elements. +price_element = context.page.locator('span.price', has_text='$').first + +# Extract the text content of the price element. +current_price_string = await price_element.text_content() or '' +# current_price_string: 'Sale price$1,398.00' + +# Split the string by the '$' sign to get the numeric part. +raw_price = current_price_string.split('$')[1] +# raw_price: '1,398.00' + +# Convert the raw price string to a float after removing commas. +price = float(raw_price.replace(',', '')) +# price: 1398.00 +``` + +It might look a little complex at first glance, but let's walk through what you did. First, you locate the correct part of the `price` span by filtering for elements containing the `$` sign. This ensures that you get the actual price element. Once you have the right element, you extract its text content, which gives you a string similar to `Sale price$1,398.00`. To get the numeric value, you split this string by the `$` sign. Next, you remove any commas from the resulting numeric string and convert it to a float, allowing you to work with the price as a number. This process ensures that you accurately extract and convert the current price from the product page. + +## Scraping stock availability + +The final step is to scrape the stock availability information. There is a `<span>` with the class `product-form__inventory`, which contains the text `In stock` if the product is available. You can use the `has_text` helper to filter out the correct element. + +```python +# Locate the element that contains the text 'In stock' and filter out other elements. 
+in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', +).first + +# Check if the element exists by counting the matching elements. +in_stock = await in_stock_element.count() > 0 +``` + +For this, all that matters is whether the element exists or not. You can use the `count()` method to check if any elements match the selector. If there are, it means the product is in stock. + +## Trying it out + +You have everything that is needed, so grab your newly created scraping logic, dump it into your original request handler and see the magic happen! + +<RunnableCodeBlock className="language-python" language="python"> + {ScrapingExample} +</RunnableCodeBlock> + +When you run the crawler, you will see the crawled URLs and their scraped data printed to the console. The output will look something like this: + +```json +{ + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-str-za810es-7-2-channel-hi-res-wi-fi-network-av-receiver", + "manufacturer": "sony", + "title": "Sony STR-ZA810ES 7.2-Ch Hi-Res Wi-Fi Network A/V Receiver", + "sku": "SON-692802-STR-DE", + "price": 698, + "in_stock": true +} +``` + +## Next steps + +Next, you'll see how to save the data you scraped to the disk for further processing. 
diff --git a/website/versioned_docs/version-0.6/introduction/07_saving_data.mdx b/website/versioned_docs/version-0.6/introduction/07_saving_data.mdx new file mode 100644 index 0000000000..adddd93af9 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/07_saving_data.mdx @@ -0,0 +1,126 @@ +--- +id: saving-data +title: Saving data +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import FirstCodeExample from '!!raw-loader!./code_examples/07_first_code.py'; + +import FinalCodeExample from '!!raw-loader!roa-loader!./code_examples/07_final_code.py'; + +A data extraction job would not be complete without saving the data for later use and processing. You've come to the final and most difficult part of this tutorial so make sure to pay attention very carefully! + +## Save data to the dataset + +Crawlee provides a <ApiLink to="class/Dataset">`Dataset`</ApiLink> class, which acts as an abstraction over tabular storage, making it useful for storing scraping results. To get started: + +- Add the necessary imports: Include the <ApiLink to="class/Dataset">`Dataset`</ApiLink> and any required crawler classes at the top of your file. +- Create a Dataset instance: Use the asynchronous <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> constructor to initialize the dataset instance within your crawler's setup. + +Here's an example: + +<CodeBlock language="python"> + {FirstCodeExample} +</CodeBlock> + +Finally, instead of logging the extracted data to stdout, we can export them to the dataset: + +```python +# ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # ... + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await dataset.push_data(data) + + # ... 
+``` + +### Using a context helper + +Instead of importing a new class and manually creating an instance of the dataset, you can use the context helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink>. Remove the dataset import and instantiation, and replace `dataset.push_data` with the following: + +```python +# ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # ... + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await context.push_data(data) + + # ... +``` + +### Final code + +And that's it. Unlike earlier, we are being serious now. That's it, you're done. The final code looks like this: + +<RunnableCodeBlock className="language-python" language="python"> + {FinalCodeExample} +</RunnableCodeBlock> + +## What `push_data` does? + +A helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> saves data to the default dataset. You can provide additional arguments there like `id` or `name` to open a different dataset. Dataset is a storage designed to hold data in a format similar to a table. Each time you call <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> or direct <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> a new row in the table is created, with the property names serving as column titles. In the default configuration, the rows are represented as JSON files saved on your file system, but other backend storage systems can be plugged into Crawlee as well. More on that later. + +:::info Automatic dataset initialization + +Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. 
For more details see the <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> function. + +::: + +{/* TODO: mention result storage guide once it's done + +:::info Automatic dataset initialization + +Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the [Result storage guide](../guides/result-storage#dataset) and the `Dataset.open()` function. + +::: +*/} + +## Finding saved data + +Unless you changed the configuration that Crawlee uses locally, which would suggest that you knew what you were doing, and you didn't need this tutorial anyway, you'll find your data in the storage directory that Crawlee creates in the working directory of the running script: + +```text +{PROJECT_FOLDER}/storage/datasets/default/ +``` + +The above folder will hold all your saved data in numbered files, as they were pushed into the dataset. Each file represents one invocation of <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> or one table row. + +{/* TODO: add mention of "Result storage guide" once it's ready: + +:::tip Single file data storage options + +If you would like to store your data in a single big file, instead of many small ones, see the [Result storage guide](../guides/result-storage#key-value-store) for Key-value stores. + +::: + +*/} + +## Next steps + +Next, you'll see some improvements that you can add to your crawler code that will make it more readable and maintainable in the long run. 
diff --git a/website/versioned_docs/version-0.6/introduction/08_refactoring.mdx b/website/versioned_docs/version-0.6/introduction/08_refactoring.mdx
new file mode 100644
index 0000000000..a194a9e839
--- /dev/null
+++ b/website/versioned_docs/version-0.6/introduction/08_refactoring.mdx
@@ -0,0 +1,72 @@
+---
+id: refactoring
+title: Refactoring
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import MainExample from '!!raw-loader!./code_examples/08_main.py';
+import RoutesExample from '!!raw-loader!./code_examples/08_routes.py';
+
+It may seem that the data is extracted and the crawler is done, but honestly, this is just the beginning. For the sake of brevity, we've completely omitted error handling, proxies, logging, architecture, tests, documentation and other stuff that reliable software should have. The good thing is, error handling is mostly done by Crawlee itself, so no worries on that front, unless you need some custom magic.
+
+:::info Navigating automatic bot-protection avoidance
+
+You might be wondering about the **anti-blocking, bot-protection avoiding stealthy features** and why we haven't highlighted them yet. The reason is straightforward: these features are **automatically used** within the default configuration, providing a smooth start without manual adjustments.
+
+:::
+
+{/* TODO: add this to the info once the relevant guide is ready
+
+However, the default configuration, while powerful, may not cover every scenario.
+
+If you want to learn more, browse the [Avoid getting blocked](../guides/avoid-blocking), [Proxy management](../guides/proxy-management) and [Session management](../guides/session-management) guides.
+*/}
+
+To promote good coding practices, let's look at how you can use a <ApiLink to="class/Router">`Router`</ApiLink> class to better structure your crawler code.
+ +## Request routing + +In the following code, we've made several changes: + +- Split the code into multiple files. +- Added custom instance of <ApiLink to="class/Router">`Router`</ApiLink> to make our routing cleaner, without if clauses. +- Moved route definitions to a separate `routes.py` file. +- Simplified the `main.py` file to focus on the general structure of the crawler. + +### Routes file + +First, let's define our routes in a separate file: + +<CodeBlock className="language-python" title="src/routes.py"> + {RoutesExample} +</CodeBlock> + +### Main file + +Next, our main file becomes much simpler and cleaner: + +<CodeBlock className="language-python" title="src/main.py"> + {MainExample} +</CodeBlock> + +By structuring your code this way, you achieve better separation of concerns, making the code easier to read, manage and extend. The <ApiLink to="class/Router">`Router`</ApiLink> class keeps your routing logic clean and modular, replacing if clauses with function decorators. + +## Summary + +Refactoring your crawler code with these practices enhances readability, maintainability, and scalability. + +### Splitting your code into multiple files + +There's no reason not to split your code into multiple files and keep your logic separate. Less code in a single file means less complexity to handle at any time, which improves overall readability and maintainability. Consider further splitting the routes into separate files for even better organization. + +### Using a router to structure your crawling + +Initially, using a simple `if` / `else` statement for selecting different logic based on the crawled pages might appear more readable. However, this approach can become cumbersome with more than two types of pages, especially when the logic for each page extends over dozens or even hundreds of lines of code. + +It's good practice in any programming language to split your logic into bite-sized chunks that are easy to read and reason about. 
Scrolling through a thousand line long `request_handler()` where everything interacts with everything and variables can be used everywhere is not a beautiful thing to do and a pain to debug. That's why we prefer the separation of routes into their own files. + +## Next steps + +In the next and final step, you'll see how to deploy your Crawlee project to the cloud. If you used the CLI to bootstrap your project, you already have a `Dockerfile` ready, and the next section will show you how to deploy it to the [Apify platform](../deployment/apify-platform) with ease. diff --git a/website/versioned_docs/version-0.6/introduction/09_running_in_cloud.mdx b/website/versioned_docs/version-0.6/introduction/09_running_in_cloud.mdx new file mode 100644 index 0000000000..3b8bbce83c --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/09_running_in_cloud.mdx @@ -0,0 +1,101 @@ +--- +id: deployment +title: Running your crawler in the Cloud +sidebar_label: Running in the Cloud +description: Deploying Crawlee-python projects to the Apify platform +--- + +import CodeBlock from '@theme/CodeBlock'; +import MainExample from '!!raw-loader!./code_examples/09_apify_sdk.py'; + +## Apify platform + +Crawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify platform**](https://console.apify.com) will give you the best experience. + +{/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/} + +With a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infra, proxies, and storages are ready to go. 
[Learn more about Actors](https://apify.com/actors). + +{/*:::info Choosing between Crawlee CLI and Apify CLI for project setup + +We started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify platform right ahead. + +:::*/} + +## Dependencies + +Before we get started, you'll need to install two new dependencies: + +- [**Apify SDK**](https://pypi.org/project/apify/), a toolkit for working with the Apify platform. This will allow us to wire the storages (e.g. [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. The Apify SDK, like Crawlee itself, is available as a PyPI package and can be installed with any Python package manager. To install it using [pip](https://pipx.pypa.io/), run: + + ```sh + pip install apify + ``` + +- [**Apify CLI**](https://docs.apify.com/cli/), a command-line tool that will help us with authentication and deployment. It is a [Node.js](https://nodejs.org/) package, and can be installed using any Node.js package manager. In this guide, we will use [npm](https://npmjs.com/). We will install it globally, so you can use it across all your Crawlee and Apify projects. To install it using npm, run: + + ```sh + npm install -g apify-cli + ``` + +## Logging in to the Apify platform + +The next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). 
You will need your personal access token, which you can find at https://console.apify.com/account#/integrations. + +```sh +apify login +``` + +## Adjusting the code + +Now that you have your account set up, you will need to adjust the code a tiny bit. We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us to wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify platform counterparts - otherwise Crawlee would keep things only in memory. + +Open your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. Your code should look like this: + +<CodeBlock className="language-python" title="src/main.py"> + {MainExample} +</CodeBlock> + +The context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown. + +:::info Understanding `async with Actor` behavior with environment variables + +The [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on the environment variables, namely based on the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify platform. This means that your project will remain working the same locally, but will use the Apify API when deployed to the Apify platform. + +::: + +## Initializing the project + +You will also need to initialize the project for Apify, to do that, use the Apify CLI again: + +```sh +apify init +``` + +The CLI will check the project structure and guide you through the setup process. If prompted, follow the instructions and answer the questions to configure the project correctly. For more information follow the [Apify CLI documentation](https://docs.apify.com/cli/docs). 
+
+This will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see all the different things you can set up there.
+
+## Ship it!
+
+And that's all, your project is now ready to be published on the Apify platform. You can use the Apify CLI once more to do that:
+
+```sh
+apify push
+```
+
+This command will create an archive from your project, upload it to the Apify platform and initiate a Docker build. Once finished, you will get a link to your new Actor on the platform.
+
+## Learning more about web scraping
+
+:::tip Explore Apify Academy Resources
+
+If you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic. From beginner to advanced. And the best thing: **It's free and open source** โค๏ธ
+
+{/*If you want to do one more project, check out our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/}
+
+:::
+
+## Thank you! ๐ŸŽ‰
+
+That's it! Thanks for reading the whole introduction and if there's anything wrong, please ๐Ÿ™ let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ๐Ÿ‘‹
diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/02_bs.py b/website/versioned_docs/version-0.6/introduction/code_examples/02_bs.py
new file mode 100644
index 0000000000..6e5ee30069
--- /dev/null
+++ b/website/versioned_docs/version-0.6/introduction/code_examples/02_bs.py
@@ -0,0 +1,30 @@
+import asyncio
+
+# Add import of crawler and crawling context.
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storages import RequestQueue + + +async def main() -> None: + # First you create the request queue instance. + rq = await RequestQueue.open() + + # And then you add one or more requests to it. + await rq.add_request('https://crawlee.dev') + + crawler = BeautifulSoupCrawler(request_manager=rq) + + # Define a request handler and attach it to the crawler using the decorator. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + # Extract <title> text with BeautifulSoup. + # See BeautifulSoup documentation for API docs. + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/02_bs_better.py b/website/versioned_docs/version-0.6/introduction/code_examples/02_bs_better.py new file mode 100644 index 0000000000..1a985722b6 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/02_bs_better.py @@ -0,0 +1,21 @@ +import asyncio + +# You don't need to import RequestQueue anymore. +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + # Start the crawler with the provided URLs. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/02_request_queue.py b/website/versioned_docs/version-0.6/introduction/code_examples/02_request_queue.py new file mode 100644 index 0000000000..e6cc5eb8c3 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/02_request_queue.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee.storages import RequestQueue + + +async def main() -> None: + # First you create the request queue instance. + rq = await RequestQueue.open() + + # And then you add one or more requests to it. + await rq.add_request('https://crawlee.dev') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/03_enqueue_strategy.py b/website/versioned_docs/version-0.6/introduction/code_examples/03_enqueue_strategy.py new file mode 100644 index 0000000000..6aff8a1fba --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/03_enqueue_strategy.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # See the `EnqueueStrategy` type alias for more strategy options. 
+ # highlight-next-line + await context.enqueue_links( + # highlight-next-line + strategy='same-domain', + # highlight-next-line + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/03_finding_new_links.py b/website/versioned_docs/version-0.6/introduction/code_examples/03_finding_new_links.py new file mode 100644 index 0000000000..e25af30c13 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/03_finding_new_links.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Let's limit our crawls to make our tests shorter and safer. + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + # The enqueue_links function is available as one of the fields of the context. + # It is also context aware, so it does not require any parameters. 
+ await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/03_globs.py b/website/versioned_docs/version-0.6/introduction/code_examples/03_globs.py new file mode 100644 index 0000000000..c2f2627d95 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/03_globs.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # Enqueue links that match the 'include' glob pattern and + # do not match the 'exclude' glob pattern. + # highlight-next-line + await context.enqueue_links( + # highlight-next-line + include=[Glob('https://someplace.com/**/cats')], + # highlight-next-line + exclude=[Glob('https://**/archive/**')], + # highlight-next-line + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/03_original_code.py b/website/versioned_docs/version-0.6/introduction/code_examples/03_original_code.py new file mode 100644 index 0000000000..976e84d562 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/03_original_code.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + 
context.log.info(f'The title of {url} is: {title}.') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/03_transform_request.py b/website/versioned_docs/version-0.6/introduction/code_examples/03_transform_request.py new file mode 100644 index 0000000000..5f11a1cafa --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/03_transform_request.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import asyncio + +from crawlee import HttpHeaders, RequestOptions, RequestTransformAction +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +def transform_request( + request_options: RequestOptions, +) -> RequestOptions | RequestTransformAction: + # Skip requests to PDF files + if request_options['url'].endswith('.pdf'): + return 'skip' + + if '/docs' in request_options['url']: + # Add custom headers to requests to specific URLs + request_options['headers'] = HttpHeaders({'Custom-Header': 'value'}) + + elif '/blog' in request_options['url']: + # Add label for certain URLs + request_options['label'] = 'BLOG' + + else: + # Signal that the request should proceed without any transformation + return 'unchanged' + + return request_options + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # Transform request before enqueueing + await context.enqueue_links(transform_request_function=transform_request) + + @crawler.router.handler('BLOG') + async def blog_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Blog Processing {context.request.url}.') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-0.6/introduction/code_examples/04_sanity_check.py b/website/versioned_docs/version-0.6/introduction/code_examples/04_sanity_check.py new file mode 100644 index 0000000000..5bfbccd27e --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/04_sanity_check.py @@ -0,0 +1,32 @@ +import asyncio + +# Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript. +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # Wait for the collection cards to render on the page. This ensures that + # the elements we want to interact with are present in the DOM. + await context.page.wait_for_selector('.collection-block-item') + + # Execute a function within the browser context to target the collection + # card elements and extract their text content, trimming any leading or + # trailing whitespace. + category_texts = await context.page.eval_on_selector_all( + '.collection-block-item', + '(els) => els.map(el => el.textContent.trim())', + ) + + # Log the extracted texts. 
+ for i, text in enumerate(category_texts): + context.log.info(f'CATEGORY_{i + 1}: {text}') + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_detail.py b/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_detail.py new file mode 100644 index 0000000000..a6845f23b0 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_detail.py @@ -0,0 +1,57 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We're not processing detail pages yet, so we just pass. + if context.request.label == 'DETAIL': + pass + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. 
+ else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. + await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_listing.py b/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_listing.py new file mode 100644 index 0000000000..c9c47f57d8 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/05_crawling_listing.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Wait for the category cards to render on the page. This ensures that + # the elements we want to interact with are present in the DOM. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements that match the specified selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/06_scraping.py b/website/versioned_docs/version-0.6/introduction/code_examples/06_scraping.py new file mode 100644 index 0000000000..f1faf1c521 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/06_scraping.py @@ -0,0 +1,97 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We are now on a product detail page, so we can extract its data. + if context.request.label == 'DETAIL': + # Split the URL and get the last part to extract the manufacturer. + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + # Extract the title using the combined selector. + title = await context.page.locator('.product-meta h1').text_content() + + # Extract the SKU using its selector. + sku = await context.page.locator( + 'span.product-meta__sku-number' + ).text_content() + + # Locate the price element that contains the '$' sign and filter out + # the visually hidden elements. + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + # Locate the element that contains the text 'In stock' + # and filter out other elements.
+ in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + # Put it all together in a dictionary. + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Print the extracted data. + context.log.info(data) + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. + else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/07_final_code.py b/website/versioned_docs/version-0.6/introduction/code_examples/07_final_code.py new file mode 100644 index 0000000000..a1a89167b5 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/07_final_code.py @@ -0,0 +1,97 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We are now on a product detail page, so we can extract its data. + if context.request.label == 'DETAIL': + # Split the URL and get the last part to extract the manufacturer. + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + # Extract the title using the combined selector. + title = await context.page.locator('.product-meta h1').text_content() + + # Extract the SKU using its selector. + sku = await context.page.locator( + 'span.product-meta__sku-number' + ).text_content() + + # Locate the price element that contains the '$' sign and filter out + # the visually hidden elements. + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + # Locate the element that contains the text 'In stock' and filter out + # other elements.
+ in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + # Put it all together in a dictionary. + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await context.push_data(data) + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. + else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/07_first_code.py b/website/versioned_docs/version-0.6/introduction/code_examples/07_first_code.py new file mode 100644 index 0000000000..89de967684 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/07_first_code.py @@ -0,0 +1,22 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import Dataset + +# ... + + +async def main() -> None: + crawler = PlaywrightCrawler() + dataset = await Dataset.open() + + # ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + ... + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/08_main.py b/website/versioned_docs/version-0.6/introduction/code_examples/08_main.py new file mode 100644 index 0000000000..09f33e3376 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/08_main.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler + +from .routes import router + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + # Provide our router instance to the crawler. 
+ request_handler=router, + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/08_routes.py b/website/versioned_docs/version-0.6/introduction/code_examples/08_routes.py new file mode 100644 index 0000000000..58031821eb --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/08_routes.py @@ -0,0 +1,72 @@ +from crawlee.crawlers import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: PlaywrightCrawlingContext) -> None: + # This is a fallback route which will handle the start URL. + context.log.info(f'default_handler is processing {context.request.url}') + + await context.page.wait_for_selector('.collection-block-item') + + await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + +@router.handler('CATEGORY') +async def category_handler(context: PlaywrightCrawlingContext) -> None: + # This replaces the context.request.label == CATEGORY branch of the if clause. + context.log.info(f'category_handler is processing {context.request.url}') + + await context.page.wait_for_selector('.product-item > a') + + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + next_button = await context.page.query_selector('a.pagination__next') + + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + +@router.handler('DETAIL') +async def detail_handler(context: PlaywrightCrawlingContext) -> None: + # This replaces the context.request.label == DETAIL branch of the if clause. 
+ context.log.info(f'detail_handler is processing {context.request.url}') + + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + title = await context.page.locator('.product-meta h1').text_content() + + sku = await context.page.locator('span.product-meta__sku-number').text_content() + + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + await context.push_data(data) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/09_apify_sdk.py b/website/versioned_docs/version-0.6/introduction/code_examples/09_apify_sdk.py new file mode 100644 index 0000000000..fd8ceaffe7 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/09_apify_sdk.py @@ -0,0 +1,25 @@ +import asyncio + +# highlight-next-line +from apify import Actor + +from crawlee.crawlers import PlaywrightCrawler + +from .routes import router + + +async def main() -> None: + # highlight-next-line + async with Actor: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + # Provide our router instance to the crawler. 
+ request_handler=router, + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/__init__.py b/website/versioned_docs/version-0.6/introduction/code_examples/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-0.6/introduction/code_examples/routes.py b/website/versioned_docs/version-0.6/introduction/code_examples/routes.py new file mode 100644 index 0000000000..be20b37c81 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/code_examples/routes.py @@ -0,0 +1,4 @@ +from crawlee.crawlers import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() diff --git a/website/versioned_docs/version-0.6/introduction/index.mdx b/website/versioned_docs/version-0.6/introduction/index.mdx new file mode 100644 index 0000000000..af37ec02c4 --- /dev/null +++ b/website/versioned_docs/version-0.6/introduction/index.mdx @@ -0,0 +1,54 @@ +--- +id: introduction +title: Introduction +--- + +import ApiLink from '@site/src/components/ApiLink'; + +Crawlee covers your crawling and scraping end-to-end and helps you **build reliable scrapers. Fast.** + +Your crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. + +## What you will learn + +The goal of the introduction is to provide a step-by-step guide to the most important features of Crawlee. 
It will walk you through creating the simplest of crawlers that only prints text to console, all the way up to a full-featured scraper that collects links from a website and extracts data. + +## 🛠 Features + +Why is Crawlee the preferred choice for web scraping and crawling? + +### Why use Crawlee instead of just a random HTTP library with an HTML parser? + +- Unified interface for **HTTP & headless browser** crawling. +- Automatic **parallel crawling** based on available system resources. +- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking). +- Automatic **retries** on errors or when you are getting blocked. +- Integrated **proxy rotation** and session management. +- Configurable **request routing** - direct URLs to the appropriate handlers. +- Persistent **queue for URLs** to crawl. +- Pluggable **storage** of both tabular data and files. +- Robust **error handling**. + +### Why to use Crawlee rather than Scrapy? + +- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright). +- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code. +- Complete **type hint** coverage. +- Based on standard **Asyncio**. + +{/* TODO: + +### 👾 HTTP crawling + +- ... +*/} + +{/* TODO: +### 💻 Real browser crawling + +- ... +*/} + +## Next steps + +Next, you will install Crawlee and learn how to bootstrap projects with the prepared Crawlee templates. diff --git a/website/versioned_docs/version-0.6/pyproject.toml b/website/versioned_docs/version-0.6/pyproject.toml new file mode 100644 index 0000000000..73a756786f --- /dev/null +++ b/website/versioned_docs/version-0.6/pyproject.toml @@ -0,0 +1,9 @@ +# Line length different from the rest of the code to make sure that the example codes visualised on the generated +# documentation webpages are shown without vertical slider to make them more readable.
+ +[tool.ruff] +# Inherit all from project top configuration file. +extend = "../pyproject.toml" + +# Override just line length +line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need slider. diff --git a/website/versioned_docs/version-0.6/quick-start/code_examples/beautifulsoup_crawler_example.py b/website/versioned_docs/version-0.6/quick-start/code_examples/beautifulsoup_crawler_example.py new file mode 100644 index 0000000000..2db8874c4b --- /dev/null +++ b/website/versioned_docs/version-0.6/quick-start/code_examples/beautifulsoup_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # BeautifulSoupCrawler crawls the web using HTTP requests + # and parses HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/quick-start/code_examples/parsel_crawler_example.py b/website/versioned_docs/version-0.6/quick-start/code_examples/parsel_crawler_example.py new file mode 100644 index 0000000000..f8ed2a3e9c --- /dev/null +++ b/website/versioned_docs/version-0.6/quick-start/code_examples/parsel_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # ParselCrawler crawls the web using HTTP requests + # and parses HTML using the Parsel library. + crawler = ParselCrawler(max_requests_per_crawl=10) + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_example.py b/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_example.py new file mode 100644 index 0000000000..1bc30ae320 --- /dev/null +++ b/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # PlaywrightCrawler crawls the web using a headless browser + # controlled by the Playwright library. + crawler = PlaywrightCrawler() + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_headful_example.py b/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_headful_example.py new file mode 100644 index 0000000000..403c665e51 --- /dev/null +++ b/website/versioned_docs/version-0.6/quick-start/code_examples/playwright_crawler_headful_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Run with a visible browser window. + # highlight-next-line + headless=False, + # Switch to the Firefox browser. + browser_type='firefox', + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-0.6/quick-start/index.mdx b/website/versioned_docs/version-0.6/quick-start/index.mdx new file mode 100644 index 0000000000..6045e39951 --- /dev/null +++ b/website/versioned_docs/version-0.6/quick-start/index.mdx @@ -0,0 +1,133 @@ +--- +id: quick-start +title: Quick start +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulsoupCrawlerExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_example.py'; +import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_example.py'; +import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_example.py'; + +import PlaywrightCrawlerHeadfulExample from '!!raw-loader!./code_examples/playwright_crawler_headful_example.py'; + +This short tutorial will help you start scraping with Crawlee in just a minute or two. 
For an in-depth understanding of how Crawlee works, check out the [Introduction](../introduction/index.mdx) section, which provides a comprehensive step-by-step guide to creating your first scraper. + +## Choose your crawler + +Crawlee offers the following main crawler classes: <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. All crawlers share the same interface, providing maximum flexibility when switching between them. + +:::caution Minimum Python version + +Crawlee requires Python 3.9 or later. + +::: + +### BeautifulSoupCrawler + +The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> is a plain HTTP crawler that parses HTML using the well-known [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library. It crawls the web using an HTTP client that mimics a browser. This crawler is very fast and efficient but cannot handle JavaScript rendering. + +### ParselCrawler + +The <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> is similar to the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> but uses the [Parsel](https://pypi.org/project/parsel/) library for HTML parsing. Parsel is a lightweight library that provides a CSS selector-based API for extracting data from HTML documents. If you are familiar with the [Scrapy](https://scrapy.org/) framework, you will feel right at home with Parsel. As with the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> cannot handle JavaScript rendering. + +### PlaywrightCrawler + +The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> uses a headless browser controlled by the [Playwright](https://playwright.dev/) library. It can manage Chromium, Firefox, Webkit, and other browsers. 
Playwright is the successor to the [Puppeteer](https://pptr.dev/) library and is becoming the de facto standard in headless browser automation. If you need a headless browser, choose Playwright. + +## Installation + +Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. + +You can install Crawlee with all features or choose only the ones you need. For installing it using the [pip](https://pip.pypa.io/en/stable/) package manager, run the following command: + +```sh +python -m pip install 'crawlee[all]' +``` + +Verify that Crawlee is successfully installed: + +```sh +python -c 'import crawlee; print(crawlee.__version__)' +``` + +If you plan to use the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, you'll need to install Playwright dependencies, including the browser binaries. To do this, run the following command: + +```sh +playwright install +``` + +For detailed installation instructions, see the [Setting up](../introduction/01_setting_up.mdx) documentation page. + +## Crawling + +Run the following example to perform a recursive crawl of the Crawlee website using the selected crawler.
+ +<Tabs groupId="quickStart"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulsoupCrawlerExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="ParselCrawler" label="ParselCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {ParselCrawlerExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightCrawlerExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> + +When you run the example, you will see Crawlee automating the data extraction process in your terminal. + +{/* TODO: improve the logging and add here a sample */} + +## Running headful browser + +By default, browsers controlled by Playwright run in headless mode (without a visible window). However, you can configure the crawler to run in a headful mode, which is useful during the development phase to observe the browser's actions. You can also switch from the default Chromium browser to Firefox or WebKit. + +<CodeBlock language="python"> + {PlaywrightCrawlerHeadfulExample} +</CodeBlock> + +When you run the example code, you'll see an automated browser navigating through the Crawlee website. + +{/* TODO: add video example */} + +## Results + +By default, Crawlee stores data in the `./storage` directory within your current working directory. The results of your crawl will be saved as JSON files under `./storage/datasets/default/`. + +To view the results, you can use the `cat` command: + +```sh +cat ./storage/datasets/default/000000001.json +``` + +The JSON file will contain data similar to the following: + +```json +{ + "url": "https://crawlee.dev/", + "title": "Crawlee · Build reliable crawlers. Fast.
| Crawlee" +} +``` + +:::tip + +If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path. + +::: + +## Examples and further reading + +For more examples showcasing various features of Crawlee, visit the [Examples](/docs/examples) section of the documentation. To get a deeper understanding of Crawlee and its components, read the step-by-step [Introduction](../introduction/index.mdx) guide. + +[//]: # (TODO: add related links once they are ready) diff --git a/website/versioned_docs/version-0.6/upgrading/upgrading_to_v0x.md b/website/versioned_docs/version-0.6/upgrading/upgrading_to_v0x.md new file mode 100644 index 0000000000..d769d67d4c --- /dev/null +++ b/website/versioned_docs/version-0.6/upgrading/upgrading_to_v0x.md @@ -0,0 +1,170 @@ +--- +id: upgrading-to-v0x +title: Upgrading to v0.x +--- + +This page summarizes the breaking changes between Crawlee for Python zero-based versions. + +## Upgrading to v0.6 + +This section summarizes the breaking changes between v0.5.x and v0.6.0. + +### HttpCrawlerOptions + +- Removed `HttpCrawlerOptions` - which contained options from `BasicCrawlerOptions` and unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both of the unique options were added to `BasicCrawlerOptions` instead. + +### HttpClient + +- The signature of the `HttpClient` class has been updated. The constructor parameters `additional_http_error_status_codes` and `ignore_http_error_status_codes` have been removed and are now only available in `BasicCrawlerOptions`. +- The method `_raise_for_error_status_code` has been removed from `HttpClient`. Its logic has been moved to the `BasicCrawler` class. + +### SessionCookies + +- Replaces the `dict` used for cookie storage in `Session.cookies` with a new `SessionCookies` class. `SessionCookies` uses `CookieJar`, which enables support for multiple domains. 
+ +### PlaywrightCrawler and PlaywrightBrowserPlugin + +- `PlaywrightCrawler` now uses a persistent browser context instead of the standard browser context. +- Added `user_data_dir` parameter for `PlaywrightCrawler` and `PlaywrightBrowserPlugin` to specify the directory for the persistent context. If not provided, a temporary directory will be created automatically. + +### Configuration + +The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`. + +### CLI dependencies + +CLI dependencies have been moved to optional dependencies. If you need the CLI, install `crawlee[cli]` + +### Abstract base classes + +We decided to move away from [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation) and remove all the `Base` prefixes from the abstract classes. It includes the following public classes: +- `BaseStorageClient` -> `StorageClient` +- `BaseBrowserController` -> `BrowserController` +- `BaseBrowserPlugin` -> `BrowserPlugin` + +### EnqueueStrategy + +The `EnqueueStrategy` has been changed from an enum to a string literal type. All its values and their meaning remain unchanged. + +## Upgrading to v0.5 + +This section summarizes the breaking changes between v0.4.x and v0.5.0. + +### Crawlers & CrawlingContexts + +- All crawler and crawling context classes have been consolidated into a single sub-package called `crawlers`.
+- The affected classes include: `AbstractHttpCrawler`, `AbstractHttpParser`, `BasicCrawler`, `BasicCrawlerOptions`, `BasicCrawlingContext`, `BeautifulSoupCrawler`, `BeautifulSoupCrawlingContext`, `BeautifulSoupParserType`, `ContextPipeline`, `HttpCrawler`, `HttpCrawlerOptions`, `HttpCrawlingContext`, `HttpCrawlingResult`, `ParsedHttpCrawlingContext`, `ParselCrawler`, `ParselCrawlingContext`, `PlaywrightCrawler`, `PlaywrightCrawlingContext`, `PlaywrightPreNavCrawlingContext`. + +Example update: +```diff +- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext ++ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +``` + +### Storage clients + +- All storage client classes have been moved into a single sub-package called `storage_clients`. +- The affected classes include: `MemoryStorageClient`, `BaseStorageClient`. + +Example update: +```diff +- from crawlee.memory_storage_client import MemoryStorageClient ++ from crawlee.storage_clients import MemoryStorageClient +``` + +### CurlImpersonateHttpClient + +- The `CurlImpersonateHttpClient` changed its import location. + +Example update: +```diff +- from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient ++ from crawlee.http_clients import CurlImpersonateHttpClient +``` + +### BeautifulSoupParser + +- Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. Probably used only in type hints. Please replace previous usages of `BeautifulSoupParser` by `BeautifulSoupParserType`. +- `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`. + +### Service locator + +- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`. +- You can use it to set the configuration, event manager or storage client globally. Or you can pass them to your crawler instance directly and it will use the service locator under the hood. 
+ +### Statistics + +- The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one. +- If you want to set your custom event manager, do it either via the service locator or pass it to the crawler. + +### Request + +- The properties `json_` and `order_no` were removed. They were there only for the internal purpose of the memory storage client, you should not need them. + +### Request storages and loaders + +- The `request_provider` parameter of `BasicCrawler.__init__` has been renamed to `request_manager` +- The `BasicCrawler.get_request_provider` method has been renamed to `BasicCrawler.get_request_manager` and it does not accept the `id` and `name` arguments anymore + - If using a specific request queue is desired, pass it as the `request_manager` on `BasicCrawler` creation +- The `RequestProvider` interface has been renamed to `RequestManager` and moved to the `crawlee.request_loaders` package +- `RequestList` has been moved to the `crawlee.request_loaders` package +- `RequestList` does not support `.drop()`, `.reclaim_request()`, `.add_request()` and `add_requests_batched()` anymore + - It implements the new `RequestLoader` interface instead of `RequestManager` + - `RequestManagerTandem` with a `RequestQueue` should be used to enable passing a `RequestList` (or any other `RequestLoader` implementation) as a `request_manager`, `await list.to_tandem()` can be used as a shortcut + +### PlaywrightCrawler + +- The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`. +- The input arguments in `PlaywrightCrawler.__init__` have been renamed: + - `browser_options` is now `browser_launch_options`, + - `page_options` is now `browser_new_context_options`. +- These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`. 
+ +## Upgrading to v0.4 + +This section summarizes the breaking changes between v0.3.x and v0.4.0. + +### Request model + +- The `Request.query_params` field has been removed. Please add query parameters directly to the URL, which was possible before as well, and is now the only supported approach. +- The `Request.payload` and `Request.data` fields have been consolidated. Now, only `Request.payload` remains, and it should be used for all payload data in requests. + +### Extended unique key computation + +- The computation of `extended_unique_key` now includes HTTP headers. While this change impacts the behavior, the interface remains the same. + +## Upgrading to v0.3 + +This section summarizes the breaking changes between v0.2.x and v0.3.0. + +### Public and private interface declaration + +In previous versions, the majority of the package was fully public, including many elements intended for internal use only. With the release of v0.3, we have clearly defined the public and private interface of the package. As a result, some imports have been updated (see below). If you are importing something now designated as private, we recommend reconsidering its use or discussing your use case with us in the discussions/issues. 
+ +Here is a list of the updated public imports: + +```diff +- from crawlee.enqueue_strategy import EnqueueStrategy ++ from crawlee import EnqueueStrategy +``` + +```diff +- from crawlee.models import Request ++ from crawlee import Request +``` + +```diff +- from crawlee.basic_crawler import Router ++ from crawlee.router import Router +``` + +### Request queue + +There were internal changes that should not affect the intended usage: + +- The unused `BaseRequestQueueClient.list_requests()` method was removed +- `RequestQueue` internals were updated to match the "Request Queue V2" implementation in Crawlee for JS + +### Service container + +A new module, `crawlee.service_container`, was added to allow management of "global instances" - currently it contains `Configuration`, `EventManager` and `BaseStorageClient`. The module also replaces the `StorageClientManager` static class. It is likely that its interface will change in the future. If your use case requires working with it, please get in touch - we'll be glad to hear any feedback. 
diff --git a/website/versioned_docs/version-1.6/api-packages.json b/website/versioned_docs/version-1.6/api-packages.json new file mode 100644 index 0000000000..aa6e7f46d7 --- /dev/null +++ b/website/versioned_docs/version-1.6/api-packages.json @@ -0,0 +1 @@ +[{"entryPoints":{"index":{"label":"Index","path":"src/index.ts"}},"packageRoot":".","packagePath":".","packageSlug":".","packageName":"crawlee"}] \ No newline at end of file diff --git a/website/versioned_docs/version-1.6/api-typedoc.json b/website/versioned_docs/version-1.6/api-typedoc.json new file mode 100644 index 0000000000..2835259db1 --- /dev/null +++ b/website/versioned_docs/version-1.6/api-typedoc.json @@ -0,0 +1,185004 @@ +{ + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1, + "module": "_cli", + "name": "cli", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2, + "module": "_cli", + "name": "template_directory", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3, + "module": "_cli", + "name": "crawler_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { 
+ "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4, + "module": "_cli", + "name": "http_client_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 5, + "module": "_cli", + "name": "package_manager_choices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 6, + "module": "_cli", + "name": "default_start_url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 7, + "module": "_cli", + "name": "default_enable_apify_integration", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 8, + "module": "_cli", + "name": "default_install_project", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee is a web scraping and browser automation library." + } + ] + }, + "decorations": [ + { + "args": ".callback(invoke_without_command=True)", + "name": "cli" + } + ], + "flags": {}, + "groups": [], + "id": 9, + "module": "_cli", + "name": "callback", + "parsedDocstring": { + "text": "Crawlee is a web scraping and browser automation library." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee is a web scraping and browser automation library." 
+ } + ] + }, + "flags": {}, + "id": 10, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "callback", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 11, + "kind": 32768, + "kindString": "Parameter", + "name": "version", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Bootstrap a new Crawlee project." + } + ] + }, + "decorations": [ + { + "args": ".command()", + "name": "cli" + } + ], + "flags": {}, + "groups": [], + "id": 12, + "module": "_cli", + "name": "create", + "parsedDocstring": { + "text": "Bootstrap a new Crawlee project." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_cli.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Bootstrap a new Crawlee project." + } + ] + }, + "flags": {}, + "id": 13, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create", + "parameters": [ + { + "defaultValue": "typer.Argument(\n default=None,\n show_default=False,\n help='The name of the project and the directory that will be created to contain it. 
'\n 'If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 14, + "kind": 32768, + "kindString": "Parameter", + "name": "project_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n '--crawler-type',\n '--template',\n show_default=False,\n click_type=Choice(crawler_choices),\n help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 15, + "kind": 32768, + "kindString": "Parameter", + "name": "crawler_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n show_default=False,\n click_type=Choice(http_client_choices),\n help='The library that will be used to make HTTP requests in your crawler. '\n 'If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 16, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n default=None,\n show_default=False,\n click_type=Choice(package_manager_choices),\n help='Package manager to be used in the new project. 
If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 17, + "kind": 32768, + "kindString": "Parameter", + "name": "package_manager", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n default=None,\n show_default=False,\n metavar='[START_URL]',\n help='The URL where crawling should start. If none is given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 18, + "kind": 32768, + "kindString": "Parameter", + "name": "start_url", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n '--apify/--no-apify',\n show_default=False,\n help='Should Apify integration be set up for you? If not given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 19, + "kind": 32768, + "kindString": "Parameter", + "name": "enable_apify_integration", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "typer.Option(\n None,\n '--install/--no-install',\n show_default=False,\n help='Should the project be installed now? 
If not given, you will be prompted.',\n )", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 20, + "kind": 32768, + "kindString": "Parameter", + "name": "install_project", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the metadata file for storage clients." + } + ] + }, + "flags": {}, + "groups": [], + "id": 21, + "module": "_consts", + "name": "METADATA_FILENAME", + "parsedDocstring": { + "text": "The name of the metadata file for storage clients." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 3 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a string representation of a log level to an integer log level." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 22, + "module": "_log_config", + "name": "string_to_log_level", + "parsedDocstring": { + "text": "Convert a string representation of a log level to an integer log level." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a string representation of a log level to an integer log level." 
+ } + ] + }, + "flags": {}, + "id": 23, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "string_to_log_level", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 24, + "kind": 32768, + "kindString": "Parameter", + "name": "level", + "type": { + "name": "LogLevel", + "type": "reference", + "target": "305" + } + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 25, + "module": "_log_config", + "name": "get_configured_log_level", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 26, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_configured_log_level", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 27, + "module": "_log_config", + "name": "configure_logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 28, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "configure_logger", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 29, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "logging.Logger", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 30, + "kind": 32768, + "kindString": "Parameter", + "name": "remove_old_handlers", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 32, + "module": "_log_config", + "name": "empty_record", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 98 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 33, + "module": "_log_config", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "include_logger_name": "Include logger name at the beginning of the log line.", + "args": "Arguments passed to the parent class.", + "kwargs": "Keyword arguments passed to the parent class." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 34, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Include logger name at the beginning of the log line." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 35, + "kind": 32768, + "kindString": "Parameter", + "name": "include_logger_name", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments passed to the parent class." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 36, + "kind": 32768, + "kindString": "Parameter", + "name": "args", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments passed to the parent class." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 37, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 38, + "module": "_log_config", + "name": "format", + "parsedDocstring": { + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format the log record nicely.\n\nThis formats the log record so that it:\n- starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then has the actual log message, if it's multiline then it's nicely indented\n- then has the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + } + ] + }, + "flags": {}, + "id": 39, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "format", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 40, + "kind": 32768, + "kindString": "Parameter", + "name": "record", + "type": { + "name": "logging.LogRecord", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.\n\nIt formats the log records so that they:\n- start with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then have the actual log message, if it's multiline then it's nicely indented\n- then have the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 33, + 38 + ], + "title": "Methods" + }, + { + "children": [ + 32 + ], + "title": "Properties" + } + ], + "id": 31, + "module": "_log_config", + "name": "CrawleeLogFormatter", + "parsedDocstring": { + "text": "Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.\n\nIt formats the log records so that they:\n- start with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n- then have the actual log message, if it's multiline then it's nicely indented\n- then have the stringified extra log fields\n- then, if an exception is a part of the log record, prints the formatted exception." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_log_config.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 41, + "module": "errors", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown from an user-defined error handler." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 42, + "module": "errors", + "name": "UserDefinedErrorHandlerError", + "parsedDocstring": { + "text": "Wraps an exception thrown from an user-defined error handler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "UserHandlerTimeoutError", + "target": "43", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out." + } + ] + }, + "flags": {}, + "groups": [], + "id": 43, + "module": "errors", + "name": "UserHandlerTimeoutError", + "parsedDocstring": { + "text": "Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "UserDefinedErrorHandlerError", + "target": "42", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Errors of `SessionError` type will trigger a session rotation.\n\nThis error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 44, + "module": "errors", + "name": "SessionError", + "parsedDocstring": { + "text": "Errors of `SessionError` type will trigger a session rotation.\n\nThis error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "ProxyError", + "target": "51", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 46, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 47, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 48, + "kind": 32768, + "kindString": "Parameter", + "name": "service", + "type": { + "name": "type", + "type": "reference", + "target": "981" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 49, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "object", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 50, + "kind": 32768, + "kindString": "Parameter", + "name": "existing_value", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when attempting to reassign a service in service container that is already in use." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 46 + ], + "title": "Methods" + } + ], + "id": 45, + "module": "errors", + "name": "ServiceConflictError", + "parsedDocstring": { + "text": "Raised when attempting to reassign a service in service container that is already in use." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when a proxy is being blocked or malfunctions." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 51, + "module": "errors", + "name": "ProxyError", + "parsedDocstring": { + "text": "Raised when a proxy is being blocked or malfunctions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "SessionError", + "target": "44", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 53, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 54, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 55, + "kind": 32768, + "kindString": "Parameter", + "name": "message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 56, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 53, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when the response status code indicates an error." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 53 + ], + "title": "Methods" + } + ], + "id": 52, + "module": "errors", + "name": "HttpStatusCodeError", + "parsedDocstring": { + "text": "Raised when the response status code indicates an error." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpClientStatusCodeError", + "target": "57", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4419, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 54, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 55, + "kind": 32768, + "kindString": "Parameter", + "name": "message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 56, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 53, + "type": 
"reference" + } + } + ], + "inheritedFrom": { + "name": "HttpStatusCodeError.__init__", + "target": 53, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when the response status code indicates an client error." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4419 + ], + "title": "Methods" + } + ], + "id": 57, + "module": "errors", + "name": "HttpClientStatusCodeError", + "parsedDocstring": { + "text": "Raised when the response status code indicates an client error." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpStatusCodeError", + "target": "52", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 59, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 60, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 61, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 62, + 
"kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "TCrawlingContext", + "type": "reference", + "target": "41" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown from a request handler (router) and extends it with crawling context." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 59 + ], + "title": "Methods" + } + ], + "id": 58, + "module": "errors", + "name": "RequestHandlerError", + "parsedDocstring": { + "text": "Wraps an exception thrown from a request handler (router) and extends it with crawling context." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 64, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 65, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 66, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { 
+ "isOptional": false, + "keyword-only": false + }, + "id": 67, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown in the initialization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 64 + ], + "title": "Methods" + } + ], + "id": 63, + "module": "errors", + "name": "ContextPipelineInitializationError", + "parsedDocstring": { + "text": "Wraps an exception thrown in the initialization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 69, + "module": "errors", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 70, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 71, + "kind": 32768, + "kindString": "Parameter", + "name": "wrapped_exception", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 72, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps an exception thrown in the finalization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 69 + ], + "title": "Methods" + } + ], + "id": 68, + "module": "errors", + "name": "ContextPipelineFinalizationError", + "parsedDocstring": { + "text": "Wraps an exception thrown in the finalization step of a context pipeline middleware.\n\nWe may not have the complete context at this point, so only `BasicCrawlingContext` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "May be thrown in the initialization phase of a middleware to signal that the request should not be processed." + } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 73, + "module": "errors", + "name": "ContextPipelineInterruptedError", + "parsedDocstring": { + "text": "May be thrown in the initialization phase of a middleware to signal that the request should not be processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when a request cannot be processed due to a conflict with required resources." 
+ } + ] + }, + "decorations": [ + { + "args": "('Errors')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [], + "id": 74, + "module": "errors", + "name": "RequestCollisionError", + "parsedDocstring": { + "text": "Raised when a request cannot be processed due to a conflict with required resources." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/errors.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 75, + "module": "__init__", + "name": "__version__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/__init__.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 76, + "module": "router", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 77, + "module": "router", + "name": "RequestHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + 
"type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 79, + "module": "router", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 80, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a default request handler.\n\nThe default request handler is invoked for requests that have either no label or a label for which we have\nno matching handler." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 81, + "module": "router", + "name": "default_handler", + "parsedDocstring": { + "text": "Register a default request handler.\n\nThe default request handler is invoked for requests that have either no label or a label for which we have\nno matching handler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a default request handler.\n\nThe default request handler is invoked for requests that have either no label or a label for which we have\nno matching handler." 
+ } + ] + }, + "flags": {}, + "id": 82, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "default_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 83, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "RequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "77" + } + } + ], + "type": { + "name": "RequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "77" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a request handler based on a label.\n\nThis decorator registers a request handler for a specific label. The handler will be invoked only for requests\nthat have the exact same label." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 84, + "module": "router", + "name": "handler", + "parsedDocstring": { + "text": "Register a request handler based on a label.\n\nThis decorator registers a request handler for a specific label. The handler will be invoked only for requests\nthat have the exact same label." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a request handler based on a label.\n\nThis decorator registers a request handler for a specific label. The handler will be invoked only for requests\nthat have the exact same label." 
+ } + ] + }, + "flags": {}, + "id": 85, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 86, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandler[TCrawlingContext]]" + }, + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable" + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Invoke a request handler that matches the request label (or the default)." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 87, + "module": "router", + "name": "__call__", + "parsedDocstring": { + "text": "Invoke a request handler that matches the request label (or the default)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Invoke a request handler that matches the request label (or the default)." 
+ } + ] + }, + "flags": {}, + "id": 88, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__call__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 89, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "TCrawlingContext", + "type": "reference", + "target": "41" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A request dispatching system that routes requests to registered handlers based on their labels.\n\nThe `Router` allows you to define and register request handlers for specific labels. When a request is received,\nthe router invokes the corresponding `request_handler` based on the request's `label`. If no matching handler\nis found, the default handler is used.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[HttpCrawlingContext]()\n\n\n# Handler for requests without a matching label handler\n@router.default_handler\nasync def default_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Request without label {context.request.url} ...')\n\n\n# Handler for category requests\n@router.handler(label='category')\nasync def category_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Category request {context.request.url} ...')\n\n\n# Handler for product requests\n@router.handler(label='product')\nasync def product_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Product {context.request.url} ...')\n\n\nasync def main() -> None:\n crawler = HttpCrawler(request_handler=router)\n await crawler.run()" + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 87, + 79, + 81, + 84 + ], + "title": "Methods" + 
} + ], + "id": 78, + "module": "router", + "name": "Router", + "parsedDocstring": { + "text": "A request dispatching system that routes requests to registered handlers based on their labels.\n\nThe `Router` allows you to define and register request handlers for specific labels. When a request is received,\nthe router invokes the corresponding `request_handler` based on the request's `label`. If no matching handler\nis found, the default handler is used.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[HttpCrawlingContext]()\n\n\n# Handler for requests without a matching label handler\n@router.default_handler\nasync def default_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Request without label {context.request.url} ...')\n\n\n# Handler for category requests\n@router.handler(label='category')\nasync def category_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Category request {context.request.url} ...')\n\n\n# Handler for product requests\n@router.handler(label='product')\nasync def product_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Product {context.request.url} ...')\n\n\nasync def main() -> None:\n crawler = HttpCrawler(request_handler=router)\n await crawler.run()" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/router.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 91, + "module": "_request", + "name": "UNPROCESSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 92, + "module": "_request", + "name": "BEFORE_NAV", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 93, + "module": "_request", + "name": "AFTER_NAV", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 94, + "module": "_request", + "name": "REQUEST_HANDLER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 95, + "module": "_request", + "name": "DONE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 96, + "module": "_request", + "name": "ERROR_HANDLER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 97, + "module": "_request", + "name": "ERROR", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 98, + "module": "_request", + "name": "SKIPPED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 93, + 92, + 95, + 97, + 96, + 94, + 98, + 91 + ], + "title": "Properties" + } + ], + "id": 90, + "module": "_request", + "name": "RequestState", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries` option of\n`BasicCrawler`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 100, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries` option of\n`BasicCrawler`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='maxRetries', frozen=True)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 101, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Describes the request's current lifecycle state." + } + ] + }, + "flags": {}, + "groups": [], + "id": 102, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Describes the request's current lifecycle state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "RequestState", + "type": "reference", + "target": "90" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of finished session rotations for this request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 103, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "The number of finished session rotations for this request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='sessionRotationCount')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 104, + "module": "_request", + "name": "skip_navigation", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 105, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='lastProxyTier')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the request should be enqueued at the front of the queue." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 106, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." + } + ] + }, + "flags": {}, + "groups": [], + "id": 107, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a session to which the request is bound." + } + ] + }, + "flags": {}, + "groups": [], + "id": 108, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "ID of a session to which the request is bound." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Annotated[str | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 107, + 101, + 106, + 105, + 100, + 108, + 103, + 104, + 102 + ], + "title": "Properties" + } + ], + "id": 99, + "module": "_request", + "name": "CrawleeRequestData", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 110, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 111, + "module": "_request", + "name": "__pydantic_extra__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 112, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Annotated[CrawleeRequestData | None, Field(alias='__crawlee')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "CrawleeRequestData", + "target": "99" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label used for request routing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 113, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "Label used for request routing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Annotated[str | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 114, + "module": "_request", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 115, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ 
+ { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 116, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 117, + "module": "_request", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 118, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 119, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 120, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "JsonSerializable", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 121, + "module": "_request", + "name": "__delitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": 
"reference" + }, + "signatures": [ + { + "flags": {}, + "id": 122, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__delitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 123, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 124, + "module": "_request", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 125, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 126, + "module": "_request", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 127, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": 
"int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 128, + "module": "_request", + "name": "__eq__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 129, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 130, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the model fields." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 131, + "module": "_request", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash based on the model fields." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the model fields." 
+ } + ] + }, + "flags": {}, + "id": 132, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents the `user_data` part of a Request.\n\nApart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible\nvalues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 121, + 128, + 114, + 131, + 124, + 126, + 117 + ], + "title": "Methods" + }, + { + "children": [ + 111, + 112, + 113, + 110 + ], + "title": "Properties" + } + ], + "id": 109, + "module": "_request", + "name": "UserData", + "parsedDocstring": { + "text": "Represents the `user_data` part of a Request.\n\nApart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible\nvalues." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 133, + "module": "_request", + "name": "user_data_adapter", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 135, + "module": "_request", + "name": "url", + "parsedDocstring": 
{ + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 136, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpMethod", + "target": "300" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 137, + "module": "_request", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 138, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 139, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 140, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 141, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 142, + "module": "_request", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 143, + "module": "_request", + "name": "keep_url_fragment", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": 
"text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 144, + "module": "_request", + "name": "use_extended_unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 145, + "module": "_request", + "name": "always_enqueue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 146, + "module": "_request", + "name": "user_data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 147, + "module": "_request", + 
"name": "no_retry", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 139 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 148, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 149, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Options that can be used to customize request creation.\n\nThis type exactly matches the parameters of `Request.from_url` method." 
+ } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 145, + 148, + 137, + 142, + 143, + 139, + 149, + 136, + 147, + 138, + 140, + 141, + 135, + 144, + 146 + ], + "title": "Properties" + } + ], + "id": 134, + "module": "_request", + "name": "RequestOptions", + "parsedDocstring": { + "text": "Options that can be used to customize request creation.\n\nThis type exactly matches the parameters of `Request.from_url` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 151, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property to override the default behavior\nand specify which URLs shall be considered equal." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 152, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property to override the default behavior\nand specify which URLs shall be considered equal." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + } + ] + }, + "flags": {}, + "groups": [], + "id": 153, + "module": "_request", + "name": "url", + "parsedDocstring": { + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 183 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 154, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "HTTP request method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 187 + } + ], + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request payload." + } + ] + }, + "flags": {}, + "groups": [], + "id": 155, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "HTTP request payload." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 190 + } + ], + "type": { + "name": "Annotated[ HttpPayload | None, BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v), PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v), Field(frozen=True), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the request has been retried." + } + ] + }, + "flags": {}, + "groups": [], + "id": 156, + "module": "_request", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the request has been retried." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 230 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will not be retried in case of failure." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 157, + "module": "_request", + "name": "no_retry", + "parsedDocstring": { + "text": "If set to `True`, the request will not be retried in case of failure." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + } + ] + }, + "flags": {}, + "groups": [], + "id": 158, + "module": "_request", + "name": "loaded_url", + "parsedDocstring": { + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the request was handled." + } + ] + }, + "flags": {}, + "groups": [], + "id": 159, + "module": "_request", + "name": "handled_at", + "parsedDocstring": { + "text": "Timestamp when the request was handled." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='handledAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 160, + "module": "_request", + "name": "from_url", + "parsedDocstring": { + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n", + "args": { + "url": "The URL of the request.", + "method": "The HTTP method of the request.", + "headers": "The HTTP headers of the request.", + "payload": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.", + "label": "A custom label to differentiate between request types. 
This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers).", + "session_id": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised.", + "unique_key": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical.", + "keep_url_fragment": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided.", + "use_extended_unique_key": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided.", + "always_enqueue": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.", + "enqueue_strategy": "The strategy that will be used for enqueuing the request.", + "max_retries": "Maximum number of retries for this request. Allows to override the global `max_request_retries`\noption of `BasicCrawler`.", + "**kwargs": "Additional request properties." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 243 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. 
If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "flags": {}, + "id": 161, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 162, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method of the request." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 163, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers of the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 164, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 165, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | str | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 166, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 167, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 168, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 169, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 170, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 171, + "kind": 32768, + "kindString": "Parameter", + "name": "always_enqueue", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that will be used for enqueuing the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 172, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_strategy", + "type": { + "name": "EnqueueStrategy | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries`\noption of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 173, + "kind": 32768, + "kindString": "Parameter", + "name": "max_retries", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 174, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 175, + "module": "_request", + "name": "get_query_param_from_url", + "parsedDocstring": { + "text": "Get the value of a specific query parameter from the URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 344 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "flags": {}, + "id": 176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_query_param_from_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 177, + "kind": 32768, + "kindString": "Parameter", + "name": "param", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 178, + "kind": 32768, + "kindString": "Parameter", + "name": "default", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A string used to differentiate between arbitrary request types." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 179, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "A string used to differentiate between arbitrary request types." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 350 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the bound session, if there is any." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 180, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "The ID of the bound session, if there is any." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 355 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 181, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 360 + } + ], + "type": { + "name": "CrawleeRequestData", + "type": "reference", + "target": "99" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 182, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 369 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "crawl_depth" + } + ], + "flags": {}, + "groups": [], + "id": 183, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 374 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 184, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawl_depth", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 185, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 186, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 378 + } + ], + "type": { + "name": "RequestState", + "type": "reference", + "target": "90" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "state" + } + ], + "flags": {}, + "groups": [], + "id": 187, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 383 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 188, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 189, + "kind": 32768, + "kindString": "Parameter", + "name": "new_state", + "type": { + "name": "RequestState", + "type": "reference", + "target": "90" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific limit on the number of retries of the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 190, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Crawlee-specific limit on the number of retries of the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 387 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific number of finished session rotations for the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 191, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "Crawlee-specific number of finished session rotations for the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 392 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "session_rotation_count" + } + ], + "flags": {}, + "groups": [], + "id": 192, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 397 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 193, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "session_rotation_count", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + 
}, + "id": 194, + "kind": 32768, + "kindString": "Parameter", + "name": "new_session_rotation_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 195, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 401 + } + ], + "type": { + "name": "EnqueueStrategy", + "type": "reference", + "target": "303" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "enqueue_strategy" + } + ], + "flags": {}, + "groups": [], + "id": 196, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 406 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 197, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "enqueue_strategy", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 198, + "kind": 32768, + "kindString": "Parameter", + "name": "new_enqueue_strategy", + "type": { + "name": "EnqueueStrategy", + "type": "reference", + "target": "303" + } + } + ], + "type": { + "name": "None", + "type": 
"literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 199, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 410 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "last_proxy_tier" + } + ], + "flags": {}, + "groups": [], + "id": 200, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 415 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 201, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "last_proxy_tier", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 202, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": 
"Indicate whether the request should be enqueued at the front of the queue." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 203, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 419 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "forefront" + } + ], + "flags": {}, + "groups": [], + "id": 204, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 424 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 205, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "forefront", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 206, + "kind": 32768, + "kindString": "Parameter", + "name": "new_value", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates whether the request was handled." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 207, + "module": "_request", + "name": "was_already_handled", + "parsedDocstring": { + "text": "Indicates whether the request was handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 428 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a request in the Crawlee framework, containing the necessary information for crawling operations.\n\nThe `Request` class is one of the core components in Crawlee, utilized by various components such as request\nproviders, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,\nincluding the URL, HTTP method, headers, payload, and user data. The user data allows custom information\nto be stored and persisted throughout the request lifecycle, including its retries.\n\nKey functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used\nfor request deduplication, controlling retries, handling state management, and enabling configuration for session\nrotation and proxy handling.\n\nThe recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically\ngenerates a unique key and identifier based on the URL and request parameters.\n\n### Usage\n\n```python\nfrom crawlee import Request\n\nrequest = Request.from_url('https://crawlee.dev')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 183, + 196, + 204, + 160, + 175, + 200, + 192, + 187 + ], + "title": "Methods" + }, + { + "children": [ + 182, + 181, + 195, + 203, + 159, + 179, + 199, + 158, + 190, + 154, + 151, + 157, + 155, + 156, + 180, + 191, + 
186, + 152, + 153, + 207 + ], + "title": "Properties" + } + ], + "id": 150, + "module": "_request", + "name": "Request", + "parsedDocstring": { + "text": "Represents a request in the Crawlee framework, containing the necessary information for crawling operations.\n\nThe `Request` class is one of the core components in Crawlee, utilized by various components such as request\nproviders, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,\nincluding the URL, HTTP method, headers, payload, and user data. The user data allows custom information\nto be stored and persisted throughout the request lifecycle, including its retries.\n\nKey functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used\nfor request deduplication, controlling retries, handling state management, and enabling configuration for session\nrotation and proxy handling.\n\nThe recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically\ngenerates a unique key and identifier based on the URL and request parameters.\n\n### Usage\n\n```python\nfrom crawlee import Request\n\nrequest = Request.from_url('https://crawlee.dev')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RequestWithLock", + "target": "208", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the lock expires." + } + ] + }, + "flags": {}, + "groups": [], + "id": 209, + "module": "_request", + "name": "lock_expires_at", + "parsedDocstring": { + "text": "The timestamp when the lock expires." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 436 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4397, + "module": "_request", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.model_config", + "target": 151, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property to override the default behavior\nand specify which URLs shall be considered equal." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4398, + "module": "_request", + "name": "unique_key", + "parsedDocstring": { + "text": "A unique key identifying the request. 
Two requests with the same `unique_key` are considered as pointing\nto the same URL.\n\nIf `unique_key` is not provided, then it is automatically generated by normalizing the URL.\nFor example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\nof `http://www.example.com/something`.\n\nPass an arbitrary non-empty text value to the `unique_key` property to override the default behavior\nand specify which URLs shall be considered equal." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "Annotated[str, Field(alias='uniqueKey', frozen=True)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.unique_key", + "target": 152, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4399, + "module": "_request", + "name": "url", + "parsedDocstring": { + "text": "The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters\nand fragments." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 183 + } + ], + "type": { + "name": "Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.url", + "target": 153, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request method." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4400, + "module": "_request", + "name": "method", + "parsedDocstring": { + "text": "HTTP request method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 187 + } + ], + "type": { + "name": "Annotated[HttpMethod, Field(frozen=True)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.method", + "target": 154, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP request payload." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4401, + "module": "_request", + "name": "payload", + "parsedDocstring": { + "text": "HTTP request payload." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 190 + } + ], + "type": { + "name": "Annotated[ HttpPayload | None, BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v), PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v), Field(frozen=True), ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.payload", + "target": 155, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the request has been retried." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4402, + "module": "_request", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the request has been retried." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 230 + } + ], + "type": { + "name": "Annotated[int, Field(alias='retryCount')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.retry_count", + "target": 156, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will not be retried in case of failure." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4403, + "module": "_request", + "name": "no_retry", + "parsedDocstring": { + "text": "If set to `True`, the request will not be retried in case of failure." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "Annotated[bool, Field(alias='noRetry')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.no_retry", + "target": 157, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4404, + "module": "_request", + "name": "loaded_url", + "parsedDocstring": { + "text": "URL of the web page that was loaded. This can differ from the original URL in case of redirects." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.loaded_url", + "target": 158, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the request was handled." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4405, + "module": "_request", + "name": "handled_at", + "parsedDocstring": { + "text": "Timestamp when the request was handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='handledAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.handled_at", + "target": 159, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. 
It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4406, + "module": "_request", + "name": "from_url", + "parsedDocstring": { + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n", + "args": { + "url": "The URL of the request.", + "method": "The HTTP method of the request.", + "headers": "The HTTP headers of the request.", + "payload": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.", + "label": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers).", + "session_id": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised.", + "unique_key": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical.", + "keep_url_fragment": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided.", + "use_extended_unique_key": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. 
This is only relevant when `unique_key` is not provided.", + "always_enqueue": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.", + "enqueue_strategy": "The strategy that will be used for enqueuing the request.", + "max_retries": "Maximum number of retries for this request. Allows to override the global `max_request_retries`\noption of `BasicCrawler`.", + "**kwargs": "Additional request properties." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 243 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new `Request` instance from a URL.\n\nThis is recommended constructor for creating new `Request` instances. It generates a `Request` object from\na given URL with additional options to customize HTTP method, payload, unique key, and other request\nproperties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\nmethod and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n" + } + ] + }, + "flags": {}, + "id": 161, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 162, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method of the request." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 163, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers of the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 164, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 165, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom label to differentiate between request types. This is stored in `user_data`, and it is\nused for request routing (different requests go to different handlers)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 166, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of a specific `Session` to which the request will be strictly bound.\nIf the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\nraised." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 167, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A unique key identifying the request. If not provided, it is automatically computed based on\nthe URL and other parameters. Requests with the same `unique_key` are treated as identical." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 168, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the URL fragment (e.g., ``section``) should be included in\nthe `unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 169, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether to include the HTTP method, ID Session and payload in the\n`unique_key` computation. This is only relevant when `unique_key` is not provided." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 170, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the request will be enqueued even if it is already present in the queue.\nUsing this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 171, + "kind": 32768, + "kindString": "Parameter", + "name": "always_enqueue", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that will be used for enqueuing the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 172, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_strategy", + "type": { + "name": "EnqueueStrategy | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of retries for this request. Allows to override the global `max_request_retries`\noption of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 173, + "kind": 32768, + "kindString": "Parameter", + "name": "max_retries", + "type": { + "name": "int | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 174, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.from_url", + "target": 160, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Request.from_url", + "target": 160, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4407, + "module": "_request", + "name": "get_query_param_from_url", + "parsedDocstring": { + "text": "Get the value of a specific query parameter from the URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 344 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the value of a specific query parameter from the URL." + } + ] + }, + "flags": {}, + "id": 176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_query_param_from_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 177, + "kind": 32768, + "kindString": "Parameter", + "name": "param", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 178, + "kind": 32768, + "kindString": "Parameter", + "name": "default", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.get_query_param_from_url", + "target": 175, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Request.get_query_param_from_url", + "target": 175, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A string used to differentiate between arbitrary request types." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4408, + "module": "_request", + "name": "label", + "parsedDocstring": { + "text": "A string used to differentiate between arbitrary request types." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 350 + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.label", + "target": 179, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the bound session, if there is any." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4409, + "module": "_request", + "name": "session_id", + "parsedDocstring": { + "text": "The ID of the bound session, if there is any." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 355 + } + ], + "type": { + "name": "str | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.session_id", + "target": 180, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific configuration stored in the `user_data`." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4410, + "module": "_request", + "name": "crawlee_data", + "parsedDocstring": { + "text": "Crawlee-specific configuration stored in the `user_data`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 360 + } + ], + "type": { + "name": "CrawleeRequestData", + "type": "reference", + "target": "99" + }, + "inheritedFrom": { + "name": "Request.crawlee_data", + "target": 181, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The depth of the request in the crawl tree." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4411, + "module": "_request", + "name": "crawl_depth", + "parsedDocstring": { + "text": "The depth of the request in the crawl tree." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 369 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.crawl_depth", + "target": 182, + "type": "reference" + }, + "overwrites": { + "name": "Request.crawl_depth", + "target": 183, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific request handling state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4412, + "module": "_request", + "name": "state", + "parsedDocstring": { + "text": "Crawlee-specific request handling state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 378 + } + ], + "type": { + "name": "RequestState", + "type": "reference", + "target": "90" + }, + "inheritedFrom": { + "name": "Request.state", + "target": 186, + "type": "reference" + }, + "overwrites": { + "name": "Request.state", + "target": 187, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific limit on the number of retries of the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4413, + "module": "_request", + "name": "max_retries", + "parsedDocstring": { + "text": "Crawlee-specific limit on the number of retries of the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 387 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.max_retries", + "target": 190, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Crawlee-specific number of finished session rotations for the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4414, + "module": "_request", + "name": "session_rotation_count", + "parsedDocstring": { + "text": "Crawlee-specific number of finished session rotations for the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 392 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.session_rotation_count", + "target": 191, + "type": "reference" + }, + "overwrites": { + "name": "Request.session_rotation_count", + "target": 192, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The strategy that was used for enqueuing the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4415, + "module": "_request", + "name": "enqueue_strategy", + "parsedDocstring": { + "text": "The strategy that was used for enqueuing the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 401 + } + ], + "type": { + "name": "EnqueueStrategy", + "type": "reference", + "target": "303" + }, + "inheritedFrom": { + "name": "Request.enqueue_strategy", + "target": 195, + "type": "reference" + }, + "overwrites": { + "name": "Request.enqueue_strategy", + "target": 196, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The last proxy tier used to process the request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4416, + "module": "_request", + "name": "last_proxy_tier", + "parsedDocstring": { + "text": "The last proxy tier used to process the request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 410 + } + ], + "type": { + "name": "int | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.last_proxy_tier", + "target": 199, + "type": "reference" + }, + "overwrites": { + "name": "Request.last_proxy_tier", + "target": 200, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the request should be enqueued at the front of the queue." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4417, + "module": "_request", + "name": "forefront", + "parsedDocstring": { + "text": "Indicate whether the request should be enqueued at the front of the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 419 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.forefront", + "target": 203, + "type": "reference" + }, + "overwrites": { + "name": "Request.forefront", + "target": 204, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates whether the request was handled." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4418, + "module": "_request", + "name": "was_already_handled", + "parsedDocstring": { + "text": "Indicates whether the request was handled." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 428 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "Request.was_already_handled", + "target": 207, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A crawling request with information about locks." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4406, + 4407 + ], + "title": "Methods" + }, + { + "children": [ + 4411, + 4410, + 4415, + 4417, + 4405, + 4408, + 4416, + 4404, + 209, + 4413, + 4400, + 4397, + 4403, + 4401, + 4402, + 4409, + 4414, + 4412, + 4398, + 4399, + 4418 + ], + "title": "Properties" + } + ], + "id": 208, + "module": "_request", + "name": "RequestWithLock", + "parsedDocstring": { + "text": "A crawling request with information about locks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_request.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 433 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Request", + "target": "150", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 210, + "module": "_service_locator", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, 
+ "groups": [], + "id": 212, + "module": "_service_locator", + "name": "global_storage_instance_manager", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "StorageInstanceManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageInstanceManager", + "target": "3914" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 213, + "module": "_service_locator", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 214, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 215, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 216, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + }, + { + "type": "literal", + 
"value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 217, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the configuration." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 218, + "module": "_service_locator", + "name": "get_configuration", + "parsedDocstring": { + "text": "Get the configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the configuration." 
+ } + ] + }, + "flags": {}, + "id": 219, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_configuration", + "parameters": [], + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 220, + "module": "_service_locator", + "name": "set_configuration", + "parsedDocstring": { + "text": "Set the configuration.\n", + "args": { + "configuration": "The configuration to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the configuration.\n" + } + ] + }, + "flags": {}, + "id": 221, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_configuration", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration to set.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 222, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the event manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 223, + "module": "_service_locator", + "name": "get_event_manager", + "parsedDocstring": { + "text": "Get the event manager." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the event manager." + } + ] + }, + "flags": {}, + "id": 224, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_event_manager", + "parameters": [], + "type": { + "name": "EventManager", + "type": "reference", + "target": "1907" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the event manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 225, + "module": "_service_locator", + "name": "set_event_manager", + "parsedDocstring": { + "text": "Set the event manager.\n", + "args": { + "event_manager": "The event manager to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the event manager.\n" + } + ] + }, + "flags": {}, + "id": 226, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_event_manager", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager to set.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 227, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager", + "type": "reference", + "target": "1907" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + 
"kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 228, + "module": "_service_locator", + "name": "get_storage_client", + "parsedDocstring": { + "text": "Get the storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage client." + } + ] + }, + "flags": {}, + "id": 229, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client", + "parameters": [], + "type": { + "name": "StorageClient", + "type": "reference", + "target": "2784" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the storage client.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 230, + "module": "_service_locator", + "name": "set_storage_client", + "parsedDocstring": { + "text": "Set the storage client.\n", + "args": { + "storage_client": "The storage client to set.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the storage client.\n" + } + ] + }, + "flags": {}, + "id": 231, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_storage_client", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client to set.\n" + } + ] + }, + "flags": { + 
"isOptional": false, + "keyword-only": false + }, + "id": 232, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient", + "type": "reference", + "target": "2784" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage instance manager. It is global manager shared by all instances of ServiceLocator." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 233, + "module": "_service_locator", + "name": "storage_instance_manager", + "parsedDocstring": { + "text": "Get the storage instance manager. It is global manager shared by all instances of ServiceLocator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "StorageInstanceManager", + "type": "reference", + "target": "3914" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Service locator for managing the services used by Crawlee.\n\nAll services are initialized to its default value lazily." + } + ] + }, + "decorations": [ + { + "args": "('Configuration')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 213, + 218, + 223, + 228, + 220, + 225, + 230 + ], + "title": "Methods" + }, + { + "children": [ + 212, + 233 + ], + "title": "Properties" + } + ], + "id": 211, + "module": "_service_locator", + "name": "ServiceLocator", + "parsedDocstring": { + "text": "Service locator for managing the services used by Crawlee.\n\nAll services are initialized to its default value lazily." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 234, + "module": "_service_locator", + "name": "service_locator", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_service_locator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 236, + "module": "configuration", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for the internal asynchronous operations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 237, + "module": "configuration", + "name": "internal_timeout", + "parsedDocstring": { + "text": "Timeout for the internal asynchronous operations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 238, + "module": "configuration", + "name": "default_browser_path", + "parsedDocstring": { + "text": "Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Annotated[ str | None, Field( validation_alias=AliasChoices( 'apify_default_browser_path', 'crawlee_default_browser_path', ) ), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables the sandbox for the browser. 
Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 239, + "module": "configuration", + "name": "disable_browser_sandbox", + "parsedDocstring": { + "text": "Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logging level." + } + ] + }, + "flags": {}, + "groups": [], + "id": 240, + "module": "configuration", + "name": "log_level", + "parsedDocstring": { + "text": "The logging level." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "LogLevel", + "type": "reference", + "target": "305" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to purge the storage on the start. This option is utilized by the storage clients." + } + ] + }, + "flags": {}, + "groups": [], + "id": 241, + "module": "configuration", + "name": "purge_on_start", + "parsedDocstring": { + "text": "Whether to purge the storage on the start. 
This option is utilized by the storage clients." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `PersistState` events are emitted. The event ensures the state persistence during\nthe crawler run. This option is utilized by the `EventManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 242, + "module": "configuration", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval at which `PersistState` events are emitted. The event ensures the state persistence during\nthe crawler run. This option is utilized by the `EventManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.\nThis option is utilized by the `LocalEventManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 243, + "module": "configuration", + "name": "system_info_interval", + "parsedDocstring": { + "text": "Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.\nThis option is utilized by the `LocalEventManager`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 244, + "module": "configuration", + "name": "max_used_cpu_ratio", + "parsedDocstring": { + "text": "The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.\nThis option is used by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 245, + "module": "configuration", + "name": "max_used_memory_ratio", + "parsedDocstring": { + "text": "The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.\nThis option is used by the `Snapshotter`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 246, + "module": "configuration", + "name": "max_event_loop_delay", + "parsedDocstring": { + "text": "The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.\nThis option is used by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.\nThis option is used by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 247, + "module": "configuration", + "name": "max_client_errors", + "parsedDocstring": { + "text": "The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.\nThis option is used by the `Snapshotter`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum used memory in megabytes. This option is utilized by the `Snapshotter`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 248, + "module": "configuration", + "name": "memory_mbytes", + "parsedDocstring": { + "text": "The maximum used memory in megabytes. This option is utilized by the `Snapshotter`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 162 + } + ], + "type": { + "name": "Annotated[ int | None, Field( validation_alias=AliasChoices( 'actor_memory_mbytes', 'apify_memory_mbytes', 'crawlee_memory_mbytes', ) ), ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to\ncalculate the maximum memory. This option is utilized by the `Snapshotter` and supports the dynamic system memory\nscaling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 249, + "module": "configuration", + "name": "available_memory_ratio", + "parsedDocstring": { + "text": "The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to\ncalculate the maximum memory. This option is utilized by the `Snapshotter` and supports the dynamic system memory\nscaling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The path to the storage directory. This option is utilized by the storage clients." + } + ] + }, + "flags": {}, + "groups": [], + "id": 250, + "module": "configuration", + "name": "storage_dir", + "parsedDocstring": { + "text": "The path to the storage directory. This option is utilized by the storage clients." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "flags": {}, + "groups": [], + "id": 251, + "module": "configuration", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option\nis passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,\nrefer to the Playwright documentation:\nhttps://playwright.dev/docs/api/class-browsertype#browser-type-launch." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 200 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 252, + "module": "configuration", + "name": "get_global_configuration", + "parsedDocstring": { + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the global instance of the configuration.\n\nMostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\ninstead." + } + ] + }, + "flags": {}, + "id": 253, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_global_configuration", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration settings for the Crawlee project.\n\nThis class stores common configurable parameters for Crawlee. Default values are provided for all settings,\nso typically, no adjustments are necessary. 
However, you may modify settings for specific use cases,\nsuch as changing the default storage directory, the default storage IDs, the timeout for internal\noperations, and more.\n\nSettings can also be configured via environment variables, prefixed with `CRAWLEE_`." + } + ] + }, + "decorations": [ + { + "args": "('Configuration')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 252 + ], + "title": "Methods" + }, + { + "children": [ + 249, + 238, + 239, + 251, + 237, + 240, + 247, + 246, + 244, + 245, + 248, + 236, + 242, + 241, + 250, + 243 + ], + "title": "Properties" + } + ], + "id": 235, + "module": "configuration", + "name": "Configuration", + "parsedDocstring": { + "text": "Configuration settings for the Crawlee project.\n\nThis class stores common configurable parameters for Crawlee. Default values are provided for all settings,\nso typically, no adjustments are necessary. However, you may modify settings for specific use cases,\nsuch as changing the default storage directory, the default storage IDs, the timeout for internal\noperations, and more.\n\nSettings can also be configured via environment variables, prefixed with `CRAWLEE_`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 255, + "module": "proxy_configuration", + "name": "url", + "parsedDocstring": { + "text": "The URL of the proxy." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The scheme of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 256, + "module": "proxy_configuration", + "name": "scheme", + "parsedDocstring": { + "text": "The scheme of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The hostname of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 257, + "module": "proxy_configuration", + "name": "hostname", + "parsedDocstring": { + "text": "The hostname of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy port." + } + ] + }, + "flags": {}, + "groups": [], + "id": 258, + "module": "proxy_configuration", + "name": "port", + "parsedDocstring": { + "text": "The proxy port." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The username for the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 259, + "module": "proxy_configuration", + "name": "username", + "parsedDocstring": { + "text": "The username for the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The password for the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 260, + "module": "proxy_configuration", + "name": "password", + "parsedDocstring": { + "text": "The password for the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The identifier of the used proxy session, if used.\nUsing the same session ID guarantees getting the same proxy URL." + } + ] + }, + "flags": {}, + "groups": [], + "id": 261, + "module": "proxy_configuration", + "name": "session_id", + "parsedDocstring": { + "text": "The identifier of the used proxy session, if used.\nUsing the same session ID guarantees getting the same proxy URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The tier of the proxy." + } + ] + }, + "flags": {}, + "groups": [], + "id": 262, + "module": "proxy_configuration", + "name": "proxy_tier", + "parsedDocstring": { + "text": "The tier of the proxy." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides information about a proxy connection that is used for requests." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + }, + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 257, + 260, + 258, + 262, + 256, + 261, + 255, + 259 + ], + "title": "Properties" + } + ], + "id": 254, + "module": "proxy_configuration", + "name": "ProxyInfo", + "parsedDocstring": { + "text": "Provides information about a proxy connection that is used for requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 264, + "module": "proxy_configuration", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n", + "args": { + "proxy_urls": "A list of URLs of proxies that will be rotated in a round-robin fashion", + "tiered_proxy_urls": "A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically\ntry to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in\nthe selected tier will be rotated in a round-robin fashion.", + "new_url_function": "A function that returns a proxy URL for a given Request. This provides full control over\nthe proxy selection mechanism." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nExactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n" + } + ] + }, + "flags": {}, + "id": 265, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of URLs of proxies that will be rotated in a round-robin fashion" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 266, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_urls", + "type": { + "name": "list[str | None] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that returns a proxy URL for a given Request. This provides full control over\nthe proxy selection mechanism." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 267, + "kind": 32768, + "kindString": "Parameter", + "name": "new_url_function", + "type": { + "name": "_NewUrlFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_NewUrlFunction", + "target": "294" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically\ntry to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in\nthe selected tier will be rotated in a round-robin fashion." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 268, + "kind": 32768, + "kindString": "Parameter", + "name": "tiered_proxy_urls", + "type": { + "name": "list[list[str | None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 269, + "module": "proxy_configuration", + "name": "new_proxy_info", + "parsedDocstring": { + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n", + "args": { + "session_id": "Session identifier. 
If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided.", + "request": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly.", + "proxy_tier": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new ProxyInfo object based on the configured proxy rotation strategy.\n" + } + ] + }, + "flags": {}, + "id": 270, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_proxy_info", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 271, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 272, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 273, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_tier", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 274, + "module": "proxy_configuration", + "name": "new_url", + "parsedDocstring": { + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n", + "args": { + "session_id": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided.", + "request": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly.", + "proxy_tier": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a proxy URL string based on the configured proxy rotation strategy.\n" + } + ] + }, + "flags": {}, + "id": 275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session identifier. If provided, same proxy URL will be returned for\nsubsequent calls with this ID. Will be auto-generated for tiered proxies if\nnot provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 276, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object used for proxy rotation and tier selection. Required for\ntiered proxies to track retries and adjust tier accordingly." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 277, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific proxy tier to use. If not provided, will be automatically\nselected based on configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 278, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_tier", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configures connection to a proxy server with the provided options.\n\nProxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or\nblacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies\nfor all connections. You can get information about the currently used proxy by inspecting the {@apilink ProxyInfo}\nproperty in your crawler's page function. There, you can inspect the proxy's URL and other attributes.\n\nIf you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of\nproxy URLs will be rotated by the configuration if this option is provided." + } + ] + }, + "decorations": [ + { + "args": "('Configuration')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 264, + 269, + 274 + ], + "title": "Methods" + } + ], + "id": 263, + "module": "proxy_configuration", + "name": "ProxyConfiguration", + "parsedDocstring": { + "text": "Configures connection to a proxy server with the provided options.\n\nProxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or\nblacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies\nfor all connections. 
You can get information about the currently used proxy by inspecting the {@apilink ProxyInfo}\nproperty in your crawler's page function. There, you can inspect the proxy's URL and other attributes.\n\nIf you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of\nproxy URLs will be rotated by the configuration if this option is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 280, + "module": "proxy_configuration", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 281, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 282, + "kind": 32768, + "kindString": "Parameter", + "name": "tiered_proxy_urls", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 283, + "module": "proxy_configuration", + "name": "all_urls", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 284, + "module": "proxy_configuration", + "name": "get_tier_urls", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_tier_urls", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 286, + "kind": 32768, + "kindString": "Parameter", + "name": "tier_number", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "URL" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"decorations": [], + "flags": {}, + "groups": [], + "id": 287, + "module": "proxy_configuration", + "name": "add_error", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 289, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 290, + "kind": 32768, + "kindString": "Parameter", + "name": "tier", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 291, + "module": "proxy_configuration", + "name": "predict_tier", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 292, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict_tier", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 293, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + 
"name": "int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 280, + 287, + 284, + 291 + ], + "title": "Methods" + }, + { + "children": [ + 283 + ], + "title": "Properties" + } + ], + "id": 279, + "module": "proxy_configuration", + "name": "_ProxyTierTracker", + "parsedDocstring": { + "text": "Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 295, + "module": "proxy_configuration", + "name": "__call__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 296, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 297, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + 
"defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 298, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None | Awaitable[str | None]", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 295 + ], + "title": "Methods" + } + ], + "id": 294, + "module": "proxy_configuration", + "name": "_NewUrlFunction", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/proxy_configuration.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 263 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 299, + "module": "_types", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + 
] + }, + "flags": {}, + "groups": [], + "id": 300, + "module": "_types", + "name": "HttpMethod", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 301, + "module": "_types", + "name": "HttpPayload", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 302, + "module": "_types", + "name": "RequestTransformAction", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 303, + "module": "_types", + "name": "EnqueueStrategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 304, + "module": "_types", + "name": "SkippedReason", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 305, + "module": "_types", + "name": "LogLevel", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 307, + "module": "_types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 
308, + "module": "_types", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 309, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 310, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 311, + "module": "_types", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 312, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 313, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 314, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 315, + "module": "_types", + "name": "__delitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 316, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__delitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 317, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new instance of `HttpHeaders` combining this one with another one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 318, + "module": "_types", + "name": "__or__", + "parsedDocstring": { + "text": "Return a new instance of `HttpHeaders` combining this one with another one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a new instance of `HttpHeaders` combining this one with another one." 
+ } + ] + }, + "flags": {}, + "id": 319, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__or__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 320, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Support reversed | operation (other | self)." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 321, + "module": "_types", + "name": "__ror__", + "parsedDocstring": { + "text": "Support reversed | operation (other | self)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Support reversed | operation (other | self)." 
+ } + ] + }, + "flags": {}, + "id": 322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__ror__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 323, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 324, + "module": "_types", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 325, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 326, + "module": "_types", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 327, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": 
"int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A dictionary-like object representing HTTP headers." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 315, + 308, + 324, + 326, + 318, + 321, + 311 + ], + "title": "Methods" + }, + { + "children": [ + 307 + ], + "title": "Properties" + } + ], + "id": 306, + "module": "_types", + "name": "HttpHeaders", + "parsedDocstring": { + "text": "A dictionary-like object representing HTTP headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 329, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "min_concurrency": "The minimum number of tasks running in parallel. If you set this value too high\nwith respect to the available system memory and CPU, your code might run extremely slow or crash.", + "max_concurrency": "The maximum number of tasks running in parallel.", + "max_tasks_per_minute": "The maximum number of tasks per minute the pool can run. By default, this is set\nto infinity, but you can pass any positive, non-zero number.", + "desired_concurrency": "The desired number of tasks that should be running parallel on the start of the pool,\nif there is a large enough supply of them. By default, it is `min_concurrency`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 330, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The minimum number of tasks running in parallel. If you set this value too high\nwith respect to the available system memory and CPU, your code might run extremely slow or crash." + } + ] + }, + "defaultValue": "1", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 331, + "kind": 32768, + "kindString": "Parameter", + "name": "min_concurrency", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of tasks running in parallel." + } + ] + }, + "defaultValue": "100", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 332, + "kind": 32768, + "kindString": "Parameter", + "name": "max_concurrency", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of tasks per minute the pool can run. By default, this is set\nto infinity, but you can pass any positive, non-zero number." 
+ } + ] + }, + "defaultValue": "float('inf')", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 333, + "kind": 32768, + "kindString": "Parameter", + "name": "max_tasks_per_minute", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The desired number of tasks that should be running parallel on the start of the pool,\nif there is a large enough supply of them. By default, it is `min_concurrency`." + } + ] + }, + "defaultValue": "10", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 334, + "kind": 32768, + "kindString": "Parameter", + "name": "desired_concurrency", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Concurrency settings for AutoscaledPool." + } + ] + }, + "decorations": [ + { + "args": "('Configuration')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 329 + ], + "title": "Methods" + } + ], + "id": 328, + "module": "_types", + "name": "ConcurrencySettings", + "parsedDocstring": { + "text": "Concurrency settings for AutoscaledPool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for the `enqueue_links` methods." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 337, + 340, + 339, + 336, + 338 + ], + "title": "Properties" + } + ], + "id": 335, + "module": "_types", + "name": "EnqueueLinksKwargs", + "parsedDocstring": { + "text": "Keyword arguments for the `enqueue_links` methods." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "AddRequestsKwargs", + "target": "341", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 342, + "module": "_types", + "name": "requests", + "parsedDocstring": { + "text": "Requests to be added to the `RequestManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 343, + "module": "_types", + "name": "rq_id", + "parsedDocstring": { + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 344, + "module": "_types", + "name": "rq_name", + "parsedDocstring": { + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 184 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 345, + "module": "_types", + "name": "rq_alias", + "parsedDocstring": { + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4392, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.limit", + "target": 336, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4393, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired[str]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.base_url", + "target": 337, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4394, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired[EnqueueStrategy]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.strategy", + "target": 338, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4395, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired[Sequence[re.Pattern | Glob]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.include", + "target": 339, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4396, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired[Sequence[re.Pattern | Glob]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "EnqueueLinksKwargs.exclude", + "target": 340, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for the `add_requests` methods." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4393, + 4396, + 4395, + 4392, + 342, + 345, + 343, + 344, + 4394 + ], + "title": "Properties" + } + ], + "id": 341, + "module": "_types", + "name": "AddRequestsKwargs", + "parsedDocstring": { + "text": "Keyword arguments for the `add_requests` methods." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "EnqueueLinksKwargs", + "target": "335", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `push_data` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 346, + "module": "_types", + "name": "PushDataKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `push_data` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PushDataFunctionCall", + "target": "347", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 348, + "module": "_types", + "name": "data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "list[dict[str, Any]] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + 
"type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 349, + "module": "_types", + "name": "dataset_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 199 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 350, + "module": "_types", + "name": "dataset_name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 200 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 351, + "module": "_types", + "name": "dataset_alias", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [ + { + "children": [ + 348, + 351, + 349, + 350 + ], + "title": "Properties" + } + ], + "id": 347, + "module": "_types", + "name": "PushDataFunctionCall", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "PushDataKwargs", + "target": "346", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 353, + "module": "_types", + "name": "get_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 354, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 355, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 356, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { 
+ "type": "literal", + "value": null + } + ] + } + }, + { + "flags": {}, + "id": 362, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 363, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": {}, + "id": 364, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 365, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 366, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + }, + { + "flags": {}, + "id": 367, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 368, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 369, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + 
"target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 357, + "module": "_types", + "name": "set_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 218 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 358, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 359, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 360, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 361, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 353, + 357 + ], + "title": "Methods" + } + ], + "id": 352, + "module": "_types", + "name": "KeyValueStoreInterface", + "parsedDocstring": { + "text": "The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 371, + "module": "_types", + "name": "content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 228 + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 372, + "module": "_types", + "name": "content_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "()", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 371, + 372 + ], + "title": "Properties" + } + ], + "id": 370, + "module": 
"_types", + "name": "KeyValueStoreValue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 374, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 233 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 375, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 376, + "kind": 32768, + "kindString": "Parameter", + "name": "actual_key_value_store", + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 377, + "module": "_types", + "name": "set_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 237 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 378, + "kind": 4096, + "kindString": "Call 
signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 379, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 380, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 381, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 382, + "module": "_types", + "name": "get_value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 254 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 383, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 384, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 385, + "kind": 32768, + "kindString": "Parameter", + "name": 
"default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": {}, + "id": 386, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 387, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": {}, + "id": 388, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 389, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 390, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + }, + { + "flags": {}, + "id": 391, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 392, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 393, + "kind": 32768, + "kindString": 
"Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 374, + 382, + 377 + ], + "title": "Methods" + } + ], + "id": 373, + "module": "_types", + "name": "KeyValueStoreChangeRecords", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 232 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 395, + "module": "_types", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 396, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 397, + "kind": 32768, + "kindString": "Parameter", + "name": "key_value_store_getter", + "type": { + "name": "GetKeyValueStoreFunction", + "type": "reference", + "target": "468" + } + }, + { + "flags": { + "isOptional": false, + 
"keyword-only": true + }, + "id": 398, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 399, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 279 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `add_requests` context helper." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 400, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Track a call to the `add_requests` context helper." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 282 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `add_requests` context helper." 
+ } + ] + }, + "flags": {}, + "id": 401, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 402, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 403, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 404, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 405, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. 
This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `push_data` context helper." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 407, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Track a call to the `push_data` context helper." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 298 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track a call to the `push_data` context helper." + } + ] + }, + "flags": {}, + "id": 408, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 409, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[dict[str, Any]] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 410, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 411, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + 
{ + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 412, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 414, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 415, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 416, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 417, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": 
true, + "keyword-only": true + }, + "id": 418, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreInterface", + "type": "reference", + "target": "352" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Apply tracked changes from handler copy to original request." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 419, + "module": "_types", + "name": "apply_request_changes", + "parsedDocstring": { + "text": "Apply tracked changes from handler copy to original request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 331 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Apply tracked changes from handler copy to original request." + } + ] + }, + "flags": {}, + "id": 420, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "apply_request_changes", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 421, + "kind": 32768, + "kindString": "Parameter", + "name": "target", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Record of calls to storage-related context helpers." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 395, + 400, + 419, + 414, + 407 + ], + "title": "Methods" + }, + { + "children": [ + 399 + ], + "title": "Properties" + } + ], + "id": 394, + "module": "_types", + "name": "RequestHandlerRunResult", + "parsedDocstring": { + "text": "Record of calls to storage-related context helpers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 261 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 423, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "requests": "Requests to be added to the `RequestManager`.", + "rq_id": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided.", + "rq_name": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided.", + "rq_alias": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided.", + "**kwargs": "Additional keyword arguments." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 348 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 424, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 425, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 426, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 427, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 428, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function for adding requests to the `RequestManager`, with optional filtering.\n\nIt simplifies the process of adding requests to the `RequestManager`. 
It automatically opens\nthe specified one and adds the provided requests." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 423 + ], + "title": "Methods" + } + ], + "id": 422, + "module": "_types", + "name": "AddRequestsFunction", + "parsedDocstring": { + "text": "Function for adding requests to the `RequestManager`, with optional filtering.\n\nIt simplifies the process of adding requests to the `RequestManager`. It automatically opens\nthe specified one and adds the provided requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 341 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 431, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call enqueue links function.\n", + "args": { + "selector": "A selector used to find the elements containing the links. 
The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors.", + "attribute": "Which node attribute to extract the links from.", + "label": "Label for the newly created `Request` objects, used for request routing.", + "user_data": "User data to be provided to the newly created `Request` objects.", + "transform_request_function": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification.", + "requests": "Requests to be added to the `RequestManager`.", + "rq_id": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided.", + "rq_name": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided.", + "rq_alias": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided.", + "**kwargs": "Additional keyword arguments." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 411 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 432, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A selector used to find the elements containing the links. 
The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 433, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 434, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 435, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 436, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 437, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "134" + }, + { + "type": "reference", + "name": "RequestTransformAction", + "target": "302" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to be added to the `RequestManager`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 438, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 439, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 440, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 441, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 443, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + 
"kind": "text", + "text": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 444, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 445, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 446, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 447, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 448, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "134" + }, + { + "type": "reference", + "name": "RequestTransformAction", + "target": "302" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 449, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 450, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 451, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. 
The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call enqueue links function.\n" + } + ] + }, + "flags": {}, + "id": 453, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + 
"kind": "text", + "text": "Requests to be added to the `RequestManager`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 454, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be\nprovided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 455, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 456, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\ncan be provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 457, + "kind": 32768, + "kindString": "Parameter", + "name": "rq_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.\n\nIt adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current 
page and enqueues\nthem for further crawling. It allows filtering through selectors and other options. You can also specify labels and\nuser data to be associated with the newly created `Request` objects.\n\nIt should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together\nwith `requests` argument.\n\nFor even more control over the enqueued links you can use combination of `ExtractLinksFunction` and\n`AddRequestsFunction`." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 431 + ], + "title": "Methods" + } + ], + "id": 430, + "module": "_types", + "name": "EnqueueLinksFunction", + "parsedDocstring": { + "text": "A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.\n\nIt adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues\nthem for further crawling. It allows filtering through selectors and other options. You can also specify labels and\nuser data to be associated with the newly created `Request` objects.\n\nIt should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together\nwith `requests` argument.\n\nFor even more control over the enqueued links you can use combination of `ExtractLinksFunction` and\n`AddRequestsFunction`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 371 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call extract links function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 460, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call extract links function.\n", + "args": { + "selector": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors.", + "attribute": "Which node attribute to extract the links from.", + "label": "Label for the newly created `Request` objects, used for request routing.", + "user_data": "User data to be provided to the newly created `Request` objects.", + "transform_request_function": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification.", + "**kwargs": "Additional keyword arguments." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 459 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call extract links function.\n" + } + ] + }, + "flags": {}, + "id": 461, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A selector used to find the elements containing the links. The behaviour differs based\non the crawler used:\n- `PlaywrightCrawler` supports CSS and XPath selectors.\n- `ParselCrawler` supports CSS selectors.\n- `BeautifulSoupCrawler` supports CSS selectors." + } + ] + }, + "defaultValue": "'a'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 462, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from." + } + ] + }, + "defaultValue": "'href'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 463, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Label for the newly created `Request` objects, used for request routing." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 464, + "kind": 32768, + "kindString": "Parameter", + "name": "label", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "User data to be provided to the newly created `Request` objects." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 465, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that takes `RequestOptions` and returns either:\n- Modified `RequestOptions` to update the request configuration,\n- `'skip'` to exclude the request from being enqueued,\n- `'unchanged'` to use the original request options without modification." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 466, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "134" + }, + { + "type": "reference", + "name": "RequestTransformAction", + "target": "302" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of requests to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 336, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "Maximum number of requests to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base URL to be used for relative URLs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 337, + "module": "_types", + "name": "base_url", + "parsedDocstring": { + "text": "Base URL to be used for relative URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 338, + "module": "_types", + "name": "strategy", + "parsedDocstring": { + "text": "Enqueue strategy to be used for determining which links to extract and enqueue.\n\nOptions:\nall: Enqueue every link encountered, regardless of the target domain. 
Use this option to ensure that all\nlinks, including those leading to external websites, are followed.\nsame-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\nThis strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\nexploration.\nsame-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\nbehavior and restricts the crawl to the current hostname, excluding subdomains.\nsame-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\ncombination of protocol, domain, and port, ensuring a strict scope for the crawl." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EnqueueStrategy", + "target": "303" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 339, + "module": "_types", + "name": "include", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must match to be enqueued." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 340, + "module": "_types", + "name": "exclude", + "parsedDocstring": { + "text": "List of regular expressions or globs that URLs must not match to be enqueued." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern" + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for extracting URLs to crawl based on elements selected by a given selector.\n\nIt extracts URLs from the 
current page and allows filtering through selectors and other options. You can also\nspecify labels and user data to be associated with the newly created `Request` objects." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 460 + ], + "title": "Methods" + } + ], + "id": 459, + "module": "_types", + "name": "ExtractLinksFunction", + "parsedDocstring": { + "text": "A function for extracting URLs to crawl based on elements selected by a given selector.\n\nIt extracts URLs from the current page and allows filtering through selectors and other options. You can also\nspecify labels and user data to be associated with the newly created `Request` objects." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 452 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 469, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "id": "The ID of the `KeyValueStore` to get.", + "name": "The name of the `KeyValueStore` to get (global scope, named storage).", + "alias": "The alias of the `KeyValueStore` to get (run scope, unnamed storage)." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 495 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 470, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `KeyValueStore` to get." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 471, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` to get (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 472, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `KeyValueStore` to get (run scope, unnamed storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 473, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "3700" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 469 + ], + "title": "Methods" + } + ], + "id": 468, + "module": "_types", + "name": "GetKeyValueStoreFunction", + "parsedDocstring": { + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 489 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 475, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "id": "The ID of the `KeyValueStore` to get.", + "name": "The name of the `KeyValueStore` to get (global scope, named storage).", + "alias": "The alias of the `KeyValueStore` to get (run scope, unnamed storage)." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 517 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 476, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `KeyValueStore` to get." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 477, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` to get (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 478, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `KeyValueStore` to get (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 479, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStoreInterface", + "target": "352" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 475 + ], + "title": "Methods" + } + ], + "id": 474, + "module": "_types", + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "parsedDocstring": { + "text": "A function for accessing a `KeyValueStore`.\n\nIt retrieves an instance of a `KeyValueStore` based on its ID or name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 481, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "data": "The data to push to the `Dataset`.", + "dataset_id": "The ID of the `Dataset` to push the data to.", + "dataset_name": "The name of the `Dataset` to push the data to (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` to push the data to (run scope, unnamed storage).", + "**kwargs": "Additional keyword arguments." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 541 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 482, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to push to the `Dataset`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 483, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[dict[str, Any]] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset` to push the data to." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 484, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` to push the data to (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 485, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` to push the data to (run scope, unnamed storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 486, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for pushing data to a `Dataset`.\n\nIt simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes\nthe provided data to it." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 481 + ], + "title": "Methods" + } + ], + "id": 480, + "module": "_types", + "name": "PushDataFunction", + "parsedDocstring": { + "text": "A function for pushing data to a `Dataset`.\n\nIt simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes\nthe provided data to it." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 534 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call send request function.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 489, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call send request function.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The payload to include in the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 568 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Call send request function.\n" + } + ] + }, + "flags": {}, + "id": 490, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 491, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 492, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The payload to include in the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 493, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 494, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for sending HTTP requests.\n\nIt simplifies the process of sending HTTP requests. 
It is implemented by the crawling context and is used\nwithin request handlers to send additional HTTP requests to target URLs." + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 489 + ], + "title": "Methods" + } + ], + "id": 488, + "module": "_types", + "name": "SendRequestFunction", + "parsedDocstring": { + "text": "A function for sending HTTP requests.\n\nIt simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used\nwithin request handlers to send additional HTTP requests to target URLs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 561 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Screenshot of the page format." + } + ] + }, + "flags": {}, + "groups": [], + "id": 496, + "module": "_types", + "name": "screenshot", + "parsedDocstring": { + "text": "Screenshot of the page format." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 594 + } + ], + "type": { + "name": "bytes | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTML content of the page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 497, + "module": "_types", + "name": "html", + "parsedDocstring": { + "text": "HTML content of the page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 597 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 498, + "module": "_types", + "name": "__bool__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 600 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 499, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of a crawled page." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + }, + { + "args": ".dataclass", + "name": "dataclasses" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 498 + ], + "title": "Methods" + }, + { + "children": [ + 497, + 496 + ], + "title": "Properties" + } + ], + "id": 495, + "module": "_types", + "name": "PageSnapshot", + "parsedDocstring": { + "text": "Snapshot of a crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 591 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 501, + "module": "_types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "default_value": "The default value to initialize the state if it is not already set.\n" + }, + "returns": "The current state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 614 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The current state." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 502, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default value to initialize the state if it is not already set.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 503, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Coroutine", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for managing state within the crawling context.\n\nIt allows the use of persistent state across multiple crawls.\n\n\n:::warning Warning\nThis is an experimental feature. 
The behavior and interface may change in future versions.\n:::" + } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 501 + ], + "title": "Methods" + } + ], + "id": 500, + "module": "_types", + "name": "UseStateFunction", + "parsedDocstring": { + "text": "A function for managing state within the crawling context.\n\nIt allows the use of persistent state across multiple crawls.\n\n\n:::warning Warning\nThis is an experimental feature. The behavior and interface may change in future versions.\n:::" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 605 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 505, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 506, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 507, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 508, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 509, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 510, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 511, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 512, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 513, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 514, + "module": "_types", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 664 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 516, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 518, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "PushDataFunction", + "target": "480" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "AddRequestsFunction", + "target": "422" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "target": "474" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Basic crawling context.\n\nIt represents the fundamental 
crawling context used by the `BasicCrawler`. It is extended by more\nspecific crawlers to provide additional functionality." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 516, + 518, + 514 + ], + "title": "Methods" + }, + { + "children": [ + 509, + 512, + 513, + 507, + 510, + 505, + 508, + 506, + 511 + ], + "title": "Properties" + } + ], + "id": 504, + "module": "_types", + "name": "BasicCrawlingContext", + "parsedDocstring": { + "text": "Basic crawling context.\n\nIt represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more\nspecific crawlers to provide additional functionality." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 630 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpCrawlingContext", + "target": "1245", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightPreNavCrawlingContext", + "target": "1470", + "type": "reference" + }, + { + "name": "PlaywrightPreNavCrawlingContext", + "target": "1779", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skips the specified number of items at the start." + } + ] + }, + "flags": {}, + "groups": [], + "id": 524, + "module": "_types", + "name": "offset", + "parsedDocstring": { + "text": "Skips the specified number of items at the start." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 694 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": {}, + "groups": [], + "id": 525, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. Unlimited if None." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 697 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 526, + "module": "_types", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 700 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 527, + "module": "_types", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 703 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 528, + "module": "_types", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 706 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 529, + "module": "_types", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 709 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": {}, + "groups": [], + "id": 530, + "module": "_types", + "name": "unwind", + "parsedDocstring": { + "text": "Unwinds items by a specified array field, turning each element into a separate item." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 712 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes empty items from the results if True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 531, + "module": "_types", + "name": "skip_empty", + "parsedDocstring": { + "text": "Excludes empty items from the results if True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 715 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 532, + "module": "_types", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Excludes fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 718 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to be flattened in returned items." + } + ] + }, + "flags": {}, + "groups": [], + "id": 533, + "module": "_types", + "name": "flatten", + "parsedDocstring": { + "text": "Fields to be flattened in returned items." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 721 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the dataset view to be used." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 534, + "module": "_types", + "name": "view", + "parsedDocstring": { + "text": "Specifies the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 724 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `get_data` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 526, + 527, + 528, + 533, + 525, + 524, + 529, + 531, + 532, + 530, + 534 + ], + "title": "Properties" + } + ], + "id": 523, + "module": "_types", + "name": "GetDataKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `get_data` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 691 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data." + } + ] + }, + "flags": {}, + "groups": [], + "id": 536, + "module": "_types", + "name": "key", + "parsedDocstring": { + "text": "The key under which to save the data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 731 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data. Either 'json' or 'csv'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 537, + "module": "_types", + "name": "content_type", + "parsedDocstring": { + "text": "The format in which to export the data. Either 'json' or 'csv'." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 734 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "json" + }, + { + "type": "literal", + "value": "csv" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file." + } + ] + }, + "flags": {}, + "groups": [], + "id": 538, + "module": "_types", + "name": "to_kvs_id", + "parsedDocstring": { + "text": "ID of the key-value store to save the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 737 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 539, + "module": "_types", + "name": "to_kvs_name", + "parsedDocstring": { + "text": "Name of the key-value store to save the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 740 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client to use for saving the exported file." + } + ] + }, + "flags": {}, + "groups": [], + "id": 540, + "module": "_types", + "name": "to_kvs_storage_client", + "parsedDocstring": { + "text": "The storage client to use for saving the exported file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 743 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration to use for saving the exported file." + } + ] + }, + "flags": {}, + "groups": [], + "id": 541, + "module": "_types", + "name": "to_kvs_configuration", + "parsedDocstring": { + "text": "The configuration to use for saving the exported file." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 746 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_to` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 537, + 536, + 541, + 538, + 539, + 540 + ], + "title": "Properties" + } + ], + "id": 535, + "module": "_types", + "name": "ExportToKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_to` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 728 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 543, + "module": "_types", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 753 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": {}, + "groups": [], + "id": 544, + "module": "_types", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 757 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 545, + "module": "_types", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 546, + "module": "_types", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 764 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." + } + ] + }, + "flags": {}, + "groups": [], + "id": 547, + "module": "_types", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 768 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ], + "target": "981" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": {}, + "groups": [], + "id": 548, + "module": "_types", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 771 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": {}, + "groups": [], + "id": 549, + "module": "_types", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 774 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 550, + "module": "_types", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 778 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 551, + "module": "_types", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 782 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_data_json` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 546, + 545, + 547, + 550, + 544, + 548, + 549, + 543, + 551 + ], + "title": "Properties" + } + ], + "id": 542, + "module": "_types", + "name": "ExportDataJsonKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_data_json` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 750 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 553, + "module": "_types", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 554, + "module": "_types", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 792 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": {}, + "groups": [], + "id": 555, + "module": "_types", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 556, + "module": "_types", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 799 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 557, + "module": "_types", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 803 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + } + ] + }, + "flags": {}, + "groups": [], + "id": 558, + "module": "_types", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 806 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 559, + "module": "_types", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 810 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + } + ] + }, + "flags": {}, + "groups": [], + "id": 560, + "module": "_types", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": {}, + "groups": [], + "id": 561, + "module": "_types", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 817 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for dataset's `export_data_csv` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 554, + 553, + 555, + 556, + 557, + 558, + 559, + 560, + 561 + ], + "title": "Properties" + } + ], + "id": 552, + "module": "_types", + "name": "ExportDataCsvKwargs", + "parsedDocstring": { + "text": "Keyword arguments for dataset's `export_data_csv` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 786 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 562, + "module": "_autoscaling.autoscaled_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raised when an AutoscaledPool run is aborted. Not for direct use." + } + ] + }, + "flags": {}, + "groups": [], + "id": 563, + "module": "_autoscaling.autoscaled_pool", + "name": "AbortError", + "parsedDocstring": { + "text": "Raised when an AutoscaledPool run is aborted. Not for direct use." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 565, + "module": "_autoscaling.autoscaled_pool", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 566, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 565 + ], + "title": "Methods" + } + ], + "id": 564, + "module": "_autoscaling.autoscaled_pool", + "name": "_AutoscaledPoolRun", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 568, + "module": 
"_autoscaling.autoscaled_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "system_status": "Provides data about system utilization (load).", + "concurrency_settings": "Settings of concurrency levels.", + "run_task_function": "A function that performs an asynchronous resource-intensive task.", + "is_task_ready_function": "A function that indicates whether `run_task_function` should be called. This\nfunction is called every time there is free capacity for a new task and it should indicate whether\nit should start a new task or not by resolving to either `True` or `False`. Besides its obvious use,\nit is also useful for task throttling to save resources.", + "is_finished_function": "A function that is called only when there are no tasks to be processed. If it\nresolves to `True` then the pool's run finishes. Being called only when there are no tasks being\nprocessed means that as long as `is_task_ready_function` keeps resolving to `True`,\n`is_finished_function` will never be called. To abort a run, use the `abort` method." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 569, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides data about system utilization (load)." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 570, + "kind": 32768, + "kindString": "Parameter", + "name": "system_status", + "type": { + "name": "SystemStatus", + "type": "reference", + "target": "586" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings of concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 571, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that performs an asynchronous resource-intensive task." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 572, + "kind": 32768, + "kindString": "Parameter", + "name": "run_task_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that indicates whether `run_task_function` should be called. This\nfunction is called every time there is free capacity for a new task and it should indicate whether\nit should start a new task or not by resolving to either `True` or `False`. Besides its obvious use,\nit is also useful for task throttling to save resources." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 573, + "kind": 32768, + "kindString": "Parameter", + "name": "is_task_ready_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that is called only when there are no tasks to be processed. If it\nresolves to `True` then the pool's run finishes. Being called only when there are no tasks being\nprocessed means that as long as `is_task_ready_function` keeps resolving to `True`,\n`is_finished_function` will never be called. To abort a run, use the `abort` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 574, + "kind": 32768, + "kindString": "Parameter", + "name": "is_finished_function", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 575, + "module": "_autoscaling.autoscaled_pool", + "name": "run", + "parsedDocstring": { + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\nIf there is an exception in one of the tasks, it will be re-raised." + } + ] + }, + "flags": {}, + "id": 576, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interrupt the autoscaled pool and all the tasks in progress." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 577, + "module": "_autoscaling.autoscaled_pool", + "name": "abort", + "parsedDocstring": { + "text": "Interrupt the autoscaled pool and all the tasks in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interrupt the autoscaled pool and all the tasks in progress." 
+ } + ] + }, + "flags": {}, + "id": 578, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "abort", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pause the autoscaled pool so that it does not start new tasks." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 579, + "module": "_autoscaling.autoscaled_pool", + "name": "pause", + "parsedDocstring": { + "text": "Pause the autoscaled pool so that it does not start new tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pause the autoscaled pool so that it does not start new tasks." + } + ] + }, + "flags": {}, + "id": 580, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pause", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 581, + "module": "_autoscaling.autoscaled_pool", + "name": "resume", + "parsedDocstring": { + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Resume a paused autoscaled pool so that it continues starting new tasks." + } + ] + }, + "flags": {}, + "id": 582, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "resume", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The current desired concurrency, possibly updated by the pool according to system load." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 583, + "module": "_autoscaling.autoscaled_pool", + "name": "desired_concurrency", + "parsedDocstring": { + "text": "The current desired concurrency, possibly updated by the pool according to system load." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of concurrent tasks in progress." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 584, + "module": "_autoscaling.autoscaled_pool", + "name": "current_concurrency", + "parsedDocstring": { + "text": "The number of concurrent tasks in progress." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.\n\nThe pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in\nany of the tasks, it is propagated and the pool is stopped." + } + ] + }, + "decorations": [ + { + "args": "('Autoscaling')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 568, + 577, + 579, + 581, + 575 + ], + "title": "Methods" + }, + { + "children": [ + 584, + 583 + ], + "title": "Properties" + } + ], + "id": 567, + "module": "_autoscaling.autoscaled_pool", + "name": "AutoscaledPool", + "parsedDocstring": { + "text": "Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.\n\nThe pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in\nany of the tasks, it is propagated and the pool is stopped." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/autoscaled_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 585, + "module": "_autoscaling.system_status", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 587, + "module": "_autoscaling.system_status", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "snapshotter": "The `Snapshotter` instance to be queried for `SystemStatus`.", + "max_snapshot_age": "Defines max age of snapshots used in the `SystemStatus.get_current_system_info`\nmeasurement.", + "cpu_overload_threshold": "Sets the threshold of overloaded snapshots in the CPU sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "memory_overload_threshold": "Sets the threshold of overloaded snapshots in the memory sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "event_loop_overload_threshold": "Sets the threshold of overloaded snapshots in the event loop sample.\nIf the sample exceeds this threshold, the system will be considered overloaded.", + "client_overload_threshold": 
"Sets the threshold of overloaded snapshots in the Client sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 588, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Snapshotter` instance to be queried for `SystemStatus`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 589, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshotter", + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "647" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines max age of snapshots used in the `SystemStatus.get_current_system_info`\nmeasurement." + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 590, + "kind": 32768, + "kindString": "Parameter", + "name": "max_snapshot_age", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the CPU sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." 
+ } + ] + }, + "defaultValue": "0.4", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 591, + "kind": 32768, + "kindString": "Parameter", + "name": "cpu_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the memory sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + ] + }, + "defaultValue": "0.2", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 592, + "kind": 32768, + "kindString": "Parameter", + "name": "memory_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the event loop sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." + } + ] + }, + "defaultValue": "0.6", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 593, + "kind": 32768, + "kindString": "Parameter", + "name": "event_loop_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the threshold of overloaded snapshots in the Client sample.\nIf the sample exceeds this threshold, the system will be considered overloaded." 
+ } + ] + }, + "defaultValue": "0.3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 594, + "kind": 32768, + "kindString": "Parameter", + "name": "client_overload_threshold", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 595, + "module": "_autoscaling.system_status", + "name": "get_current_system_info", + "parsedDocstring": { + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n", + "returns": "An object representing the current system status." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object representing the current system status." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the current status of system resources.\n\nConsiders snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\noverloaded based on predefined thresholds for each resource type.\n" + } + ] + }, + "flags": {}, + "id": 596, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_current_system_info", + "parameters": [], + "type": { + "name": "SystemInfo", + "type": "reference", + "target": "604" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 597, + "module": "_autoscaling.system_status", + "name": "get_historical_system_info", + "parsedDocstring": { + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n", + "returns": "An object representing the historical system status." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object representing the historical system status." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve and evaluates the historical status of system resources.\n\nConsiders the entire history of snapshots from the Snapshotter to assess long-term system performance and\ndetermines if the system has been historically overloaded.\n" + } + ] + }, + "flags": {}, + "id": 598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_historical_system_info", + "parameters": [], + "type": { + "name": "SystemInfo", + "type": "reference", + "target": "604" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.\n\nThis class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical\nstatus of system resources like CPU, memory, event loop, and client API usage. It exposes two methods\n`get_current_system_info` and `get_historical_system_info`. The system information is computed using a weighted\naverage of overloaded messages in the snapshots, with the weights being the time intervals between the snapshots.\nEach resource is computed separately, and the system is considered as overloaded whenever at least one resource\nis overloaded.\n\n`get_current_system_info` returns a `SystemInfo` data structure that represents the current status\nof the system. The length of the current timeframe in seconds is configurable by the `max_snapshot_age` option\nand represents the max age of snapshots to be considered for the computation.\n\n`SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status of the system.\nIt considers the full snapshot history available in the `Snapshotter` instance." 
+ } + ] + }, + "decorations": [ + { + "args": "('Autoscaling')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 587, + 595, + 597 + ], + "title": "Methods" + } + ], + "id": 586, + "module": "_autoscaling.system_status", + "name": "SystemStatus", + "parsedDocstring": { + "text": "Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.\n\nThis class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical\nstatus of system resources like CPU, memory, event loop, and client API usage. It exposes two methods\n`get_current_system_info` and `get_historical_system_info`. The system information is computed using a weighted\naverage of overloaded messages in the snapshots, with the weights being the time intervals between the snapshots.\nEach resource is computed separately, and the system is considered as overloaded whenever at least one resource\nis overloaded.\n\n`get_current_system_info` returns a `SystemInfo` data structure that represents the current status\nof the system. The length of the current timeframe in seconds is configurable by the `max_snapshot_age` option\nand represents the max age of snapshots to be considered for the computation.\n\n`SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status of the system.\nIt considers the full snapshot history available in the `Snapshotter` instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/system_status.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 599, + "module": "_autoscaling._types", + "name": "SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,\nthe resource is considered as overloaded." + } + ] + }, + "flags": {}, + "groups": [], + "id": 601, + "module": "_autoscaling._types", + "name": "limit_ratio", + "parsedDocstring": { + "text": "The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,\nthe resource is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The actual ratio of overloaded and non-overloaded samples." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 602, + "module": "_autoscaling._types", + "name": "actual_ratio", + "parsedDocstring": { + "text": "The actual ratio of overloaded and non-overloaded samples." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the resource is currently overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 603, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the resource is currently overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent the load ratio of a resource." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 602, + 603, + 601 + ], + "title": "Properties" + } + ], + "id": 600, + "module": "_autoscaling._types", + "name": "LoadRatioInfo", + "parsedDocstring": { + "text": "Represent the load ratio of a resource." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The CPU load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 605, + "module": "_autoscaling._types", + "name": "cpu_info", + "parsedDocstring": { + "text": "The CPU load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "600" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The memory load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 606, + "module": "_autoscaling._types", + "name": "memory_info", + "parsedDocstring": { + "text": "The memory load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "600" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event loop load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 607, + "module": "_autoscaling._types", + "name": "event_loop_info", + "parsedDocstring": { + "text": "The event loop load ratio." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "600" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The client load ratio." + } + ] + }, + "flags": {}, + "groups": [], + "id": 608, + "module": "_autoscaling._types", + "name": "client_info", + "parsedDocstring": { + "text": "The client load ratio." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "LoadRatioInfo", + "type": "reference", + "target": "600" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 609, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the system is currently idle or overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 610, + "module": "_autoscaling._types", + "name": "is_system_idle", + "parsedDocstring": { + "text": "Indicate whether the system is currently idle or overloaded." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation of the system info." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 611, + "module": "_autoscaling._types", + "name": "__str__", + "parsedDocstring": { + "text": "Get a string representation of the system info." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation of the system info." + } + ] + }, + "flags": {}, + "id": 612, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent the current status of the system." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 611 + ], + "title": "Methods" + }, + { + "children": [ + 608, + 605, + 609, + 607, + 610, + 606 + ], + "title": "Properties" + } + ], + "id": 604, + "module": "_autoscaling._types", + "name": "SystemInfo", + "parsedDocstring": { + "text": "Represent the current status of the system." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ratio of CPU currently in use." + } + ] + }, + "flags": {}, + "groups": [], + "id": 614, + "module": "_autoscaling._types", + "name": "used_ratio", + "parsedDocstring": { + "text": "The ratio of CPU currently in use." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum ratio of CPU that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 615, + "module": "_autoscaling._types", + "name": "max_used_ratio", + "parsedDocstring": { + "text": "The maximum ratio of CPU that is considered acceptable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 616, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the CPU is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 617, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the CPU is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A snapshot of CPU usage." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 616, + 617, + 615, + 614 + ], + "title": "Properties" + } + ], + "id": 613, + "module": "_autoscaling._types", + "name": "CpuSnapshot", + "parsedDocstring": { + "text": "A snapshot of CPU usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 619, + "module": "_autoscaling._types", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of all processes, system-wide." + } + ] + }, + "flags": {}, + "groups": [], + "id": 620, + "module": "_autoscaling._types", + "name": "system_wide_used_size", + "parsedDocstring": { + "text": "Memory usage of all processes, system-wide." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "ByteSize | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ByteSize", + "target": "681" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum memory that can be used by `AutoscaledPool`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 621, + "module": "_autoscaling._types", + "name": "max_memory_size", + "parsedDocstring": { + "text": "The maximum memory that can be used by `AutoscaledPool`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total memory available in the whole system." + } + ] + }, + "flags": {}, + "groups": [], + "id": 622, + "module": "_autoscaling._types", + "name": "system_wide_memory_size", + "parsedDocstring": { + "text": "Total memory available in the whole system." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "ByteSize | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ByteSize", + "target": "681" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum acceptable ratio of `current_size` to `max_memory_size`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 623, + "module": "_autoscaling._types", + "name": "max_used_memory_ratio", + "parsedDocstring": { + "text": "The maximum acceptable ratio of `current_size` to `max_memory_size`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 624, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the memory is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 625, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the memory is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A snapshot of memory usage." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 624, + 619, + 625, + 621, + 623, + 622, + 620 + ], + "title": "Properties" + } + ], + "id": 618, + "module": "_autoscaling._types", + "name": "MemorySnapshot", + "parsedDocstring": { + "text": "A snapshot of memory usage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The current delay of the event loop." + } + ] + }, + "flags": {}, + "groups": [], + "id": 627, + "module": "_autoscaling._types", + "name": "delay", + "parsedDocstring": { + "text": "The current delay of the event loop." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum delay that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 628, + "module": "_autoscaling._types", + "name": "max_delay", + "parsedDocstring": { + "text": "The maximum delay that is considered acceptable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 629, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The amount of time by which the delay exceeds the maximum delay." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 630, + "module": "_autoscaling._types", + "name": "max_delay_exceeded", + "parsedDocstring": { + "text": "The amount of time by which the delay exceeds the maximum delay." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 139 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the event loop is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 631, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the event loop is considered as overloaded." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of the state of the event loop." 
+ } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 629, + 627, + 631, + 628, + 630 + ], + "title": "Properties" + } + ], + "id": 626, + "module": "_autoscaling._types", + "name": "EventLoopSnapshot", + "parsedDocstring": { + "text": "Snapshot of the state of the event loop." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of errors (HTTP 429) that occurred." + } + ] + }, + "flags": {}, + "groups": [], + "id": 633, + "module": "_autoscaling._types", + "name": "error_count", + "parsedDocstring": { + "text": "The number of errors (HTTP 429) that occurred." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of new errors (HTTP 429) that occurred since the last snapshot." + } + ] + }, + "flags": {}, + "groups": [], + "id": 634, + "module": "_autoscaling._types", + "name": "new_error_count", + "parsedDocstring": { + "text": "The number of new errors (HTTP 429) that occurred since the last snapshot." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of errors that is considered acceptable." + } + ] + }, + "flags": {}, + "groups": [], + "id": 635, + "module": "_autoscaling._types", + "name": "max_error_count", + "parsedDocstring": { + "text": "The maximum number of errors that is considered acceptable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The time at which the system load information was measured." + } + ] + }, + "flags": {}, + "groups": [], + "id": 636, + "module": "_autoscaling._types", + "name": "created_at", + "parsedDocstring": { + "text": "The time at which the system load information was measured." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 162 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the client is considered as overloaded." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 637, + "module": "_autoscaling._types", + "name": "is_overloaded", + "parsedDocstring": { + "text": "Indicate whether the client is considered as overloaded." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Snapshot of the state of the client." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 636, + 633, + 637, + 635, + 634 + ], + "title": "Properties" + } + ], + "id": 632, + "module": "_autoscaling._types", + "name": "ClientSnapshot", + "parsedDocstring": { + "text": "Snapshot of the state of the client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 638, + "module": "_autoscaling._types", + "name": "Snapshot", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 640, + "module": "_autoscaling._types", + "name": "value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "float", 
+ "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents ratio of memory." + } + ] + }, + "decorations": [ + { + "name": "pydantic_dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 640 + ], + "title": "Properties" + } + ], + "id": 639, + "module": "_autoscaling._types", + "name": "Ratio", + "parsedDocstring": { + "text": "Represents ratio of memory." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 641, + "module": "_autoscaling.snapshotter", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 642, + "module": "_autoscaling.snapshotter", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an item to the list maintaining sorted order by `created_at` using binary 
search." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 644, + "module": "_autoscaling.snapshotter", + "name": "add", + "parsedDocstring": { + "text": "Add an item to the list maintaining sorted order by `created_at` using binary search." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an item to the list maintaining sorted order by `created_at` using binary search." + } + ] + }, + "flags": {}, + "id": 645, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 646, + "kind": 32768, + "kindString": "Parameter", + "name": "item", + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list that maintains sorted order by `created_at` attribute for snapshot objects." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 644 + ], + "title": "Methods" + } + ], + "id": 643, + "module": "_autoscaling.snapshotter", + "name": "SortedSnapshotList", + "parsedDocstring": { + "text": "A list that maintains sorted order by `created_at` attribute for snapshot objects." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 648, + "module": "_autoscaling.snapshotter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n", + "args": { + "max_used_cpu_ratio": "Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than\nthe provided ratio, the CPU is considered overloaded.", + "max_used_memory_ratio": "Sets the ratio, defining the maximum ratio of memory usage. When the memory usage\nis higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.", + "max_event_loop_delay": "Sets the maximum delay of the event loop. When the delay is higher than the provided\nvalue, the event loop is considered overloaded.", + "max_client_errors": "Sets the maximum number of client errors (HTTP 429). When the number of client errors\nis higher than the provided number, the client is considered overloaded.", + "max_memory_size": "Sets the maximum amount of system memory to be used by the `AutoscaledPool`. When of type\n`ByteSize` then it is used as fixed memory size. When of type `Ratio` then it allows for dynamic memory\nscaling based on the available system memory." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "flags": {}, + "id": 649, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than\nthe provided ratio, the CPU is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 650, + "kind": 32768, + "kindString": "Parameter", + "name": "max_used_cpu_ratio", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the ratio, defining the maximum ratio of memory usage. When the memory usage\nis higher than the provided ratio of `max_memory_size`, the memory is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 651, + "kind": 32768, + "kindString": "Parameter", + "name": "max_used_memory_ratio", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum delay of the event loop. When the delay is higher than the provided\nvalue, the event loop is considered overloaded." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 652, + "kind": 32768, + "kindString": "Parameter", + "name": "max_event_loop_delay", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum number of client errors (HTTP 429). When the number of client errors\nis higher than the provided number, the client is considered overloaded." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 653, + "kind": 32768, + "kindString": "Parameter", + "name": "max_client_errors", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Sets the maximum amount of system memory to be used by the `AutoscaledPool`. When of type\n`ByteSize` then it is used as fixed memory size. When of type `Ratio` then it allows for dynamic memory\nscaling based on the available system memory." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 654, + "kind": 32768, + "kindString": "Parameter", + "name": "max_memory_size", + "type": { + "name": "ByteSize | Ratio", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ByteSize", + "target": "681" + }, + { + "type": "reference", + "name": "Ratio", + "target": "639" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 655, + "module": "_autoscaling.snapshotter", + "name": "from_config", + "parsedDocstring": { + "text": "Initialize a new instance based on the provided `Configuration`.\n", + "args": { + "config": "The 
`Configuration` instance. Uses the global (default) one if not provided." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "flags": {}, + "id": 656, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_config", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Uses the global (default) one if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 657, + "kind": 32768, + "kindString": "Parameter", + "name": "config", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "647" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 658, + "module": "_autoscaling.snapshotter", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start capturing snapshots at configured intervals.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 659, + "module": "_autoscaling.snapshotter", + "name": "__aenter__", + "parsedDocstring": { + "text": "Start capturing snapshots at configured intervals.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start capturing snapshots at configured intervals.\n" + } + ] + }, + "flags": {}, + "id": 660, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Snapshotter", + "type": "reference", + "target": "647" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to terminate resource capturing when it is no longer needed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 661, + "module": "_autoscaling.snapshotter", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to 
terminate resource capturing when it is no longer needed.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 176 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop all resource capturing.\n\nThis method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\nIt should be called to terminate resource capturing when it is no longer needed.\n" + } + ] + }, + "flags": {}, + "id": 662, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 663, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 664, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 665, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + 
] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest memory snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 666, + "module": "_autoscaling.snapshotter", + "name": "get_memory_sample", + "parsedDocstring": { + "text": "Return a sample of the latest memory snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of memory snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of memory snapshots." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest memory snapshots.\n" + } + ] + }, + "flags": {}, + "id": 667, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_memory_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. 
If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 668, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "638" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest event loop snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 669, + "module": "_autoscaling.snapshotter", + "name": "get_event_loop_sample", + "parsedDocstring": { + "text": "Return a sample of the latest event loop snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of event loop snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 214 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of event loop snapshots." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest event loop snapshots.\n" + } + ] + }, + "flags": {}, + "id": 670, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_event_loop_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 671, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "638" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest CPU snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 672, + "module": "_autoscaling.snapshotter", + "name": "get_cpu_sample", + "parsedDocstring": { + "text": "Return a sample of the latest CPU snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of CPU snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of CPU snapshots." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest CPU snapshots.\n" + } + ] + }, + "flags": {}, + "id": 673, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cpu_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 674, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "638" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest client snapshots.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 675, + "module": "_autoscaling.snapshotter", + "name": "get_client_sample", + "parsedDocstring": { + "text": "Return a sample of the latest client snapshots.\n", + "args": { + "duration": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + }, + "returns": "A sample of client snapshots." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 240 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A sample of client snapshots." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a sample of the latest client snapshots.\n" + } + ] + }, + "flags": {}, + "id": 676, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_client_sample", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 677, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Snapshot", + "target": "638" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Monitors and logs system resource usage at predefined intervals for performance optimization.\n\nThe class monitors and records the state of various system resources (CPU, memory, event loop, and client API)\nat predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal\nperformance of the application. It is utilized in the `AutoscaledPool` module to adjust task allocation\ndynamically based on the current demand and system load." 
+ } + ] + }, + "decorations": [ + { + "args": "('Autoscaling')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 659, + 661, + 648, + 655, + 675, + 672, + 669, + 666 + ], + "title": "Methods" + }, + { + "children": [ + 658 + ], + "title": "Properties" + } + ], + "id": 647, + "module": "_autoscaling.snapshotter", + "name": "Snapshotter", + "parsedDocstring": { + "text": "Monitors and logs system resource usage at predefined intervals for performance optimization.\n\nThe class monitors and records the state of various system resources (CPU, memory, event loop, and client API)\nat predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal\nperformance of the application. It is utilized in the `AutoscaledPool` module to adjust task allocation\ndynamically based on the current demand and system load." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_autoscaling/snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 678, + "module": "_utils.blocked", + "name": "CLOUDFLARE_RETRY_CSS_SELECTORS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 679, + "module": "_utils.blocked", + "name": "RETRY_CSS_SELECTORS", + "parsedDocstring": { + "text": "CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning." + } + ] + }, + "flags": {}, + "groups": [], + "id": 680, + "module": "_utils.blocked", + "name": "ROTATE_PROXY_ERRORS", + "parsedDocstring": { + "text": "Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/blocked.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 682, + "module": "_utils.byte_size", + "name": "bytes", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + 
"groups": [], + "id": 683, + "module": "_utils.byte_size", + "name": "__post_init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__post_init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 685, + "module": "_utils.byte_size", + "name": "validate", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "validate", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 687, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 688, + "module": "_utils.byte_size", + "name": "from_kb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 689, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_kb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 690, + "kind": 32768, + "kindString": "Parameter", + "name": "kb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 691, + "module": "_utils.byte_size", + "name": "from_mb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 692, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_mb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 693, + "kind": 32768, + "kindString": "Parameter", + "name": "mb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 694, + "module": "_utils.byte_size", + "name": "from_gb", + 
"parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 695, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_gb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 696, + "kind": 32768, + "kindString": "Parameter", + "name": "gb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 697, + "module": "_utils.byte_size", + "name": "from_tb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 698, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_tb", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 699, + "kind": 32768, + "kindString": "Parameter", + "name": "tb", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 700, + "module": "_utils.byte_size", + 
"name": "to_kb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 701, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_kb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 702, + "module": "_utils.byte_size", + "name": "to_mb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 703, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_mb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 704, + "module": "_utils.byte_size", + "name": "to_gb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 705, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_gb", + "parameters": [], + "type": { + "name": "float", + 
"type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 706, + "module": "_utils.byte_size", + "name": "to_tb", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 707, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_tb", + "parameters": [], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 708, + "module": "_utils.byte_size", + "name": "__str__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 709, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 710, + "module": "_utils.byte_size", + "name": "__eq__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 711, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 712, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the bytes value." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 713, + "module": "_utils.byte_size", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash based on the bytes value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the bytes value." 
+ } + ] + }, + "flags": {}, + "id": 714, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 715, + "module": "_utils.byte_size", + "name": "__lt__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 716, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__lt__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 717, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 718, + "module": "_utils.byte_size", + "name": "__le__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 719, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__le__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 720, + "kind": 32768, + "kindString": 
"Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 721, + "module": "_utils.byte_size", + "name": "__gt__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 722, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__gt__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 723, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 724, + "module": "_utils.byte_size", + "name": "__ge__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 725, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__ge__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 726, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": 
"object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 727, + "module": "_utils.byte_size", + "name": "__add__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 728, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__add__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 729, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 730, + "module": "_utils.byte_size", + "name": "__sub__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 731, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__sub__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 732, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": 
"reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 733, + "module": "_utils.byte_size", + "name": "__mul__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 734, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__mul__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 735, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 736, + "module": "_utils.byte_size", + "name": "__truediv__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 737, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__truediv__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 738, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + 
"type": "reference" + } + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 739, + "module": "_utils.byte_size", + "name": "__rmul__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 740, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__rmul__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 741, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a byte size." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 727, + 710, + 724, + 721, + 713, + 718, + 715, + 733, + 683, + 739, + 708, + 730, + 736, + 694, + 688, + 691, + 697, + 704, + 700, + 702, + 706, + 685 + ], + "title": "Methods" + }, + { + "children": [ + 682 + ], + "title": "Properties" + } + ], + "id": 681, + "module": "_utils.byte_size", + "name": "ByteSize", + "parsedDocstring": { + "text": "Represents a byte size." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/byte_size.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 742, + "module": "_utils.console", + "name": "BORDER", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/console.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a text table using Unicode characters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 743, + "module": "_utils.console", + "name": "make_table", + "parsedDocstring": { + "text": "Create a text table using Unicode characters.\n", + "args": { + "rows": "A list of tuples/lists to be displayed in the table.", + "width": "Maximum width of the table." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/console.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a text table using Unicode characters.\n" + } + ] + }, + "flags": {}, + "id": 744, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "make_table", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of tuples/lists to be displayed in the table." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 745, + "kind": 32768, + "kindString": "Parameter", + "name": "rows", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum width of the table." + } + ] + }, + "defaultValue": "100", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 746, + "kind": 32768, + "kindString": "Parameter", + "name": "width", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 747, + "module": "_utils.context", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. 
Works for both synchronous and asynchronous methods.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 748, + "module": "_utils.context", + "name": "ensure_context", + "parsedDocstring": { + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. Works for both synchronous and asynchronous methods.\n", + "args": { + "method": "The method to wrap.\n" + }, + "returns": "The wrapped method with context checking applied." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The wrapped method with context checking applied." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Ensure the (async) context manager is initialized before executing the method.\n\nThis decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\nIf the instance is inactive, it raises a `RuntimeError`. 
Works for both synchronous and asynchronous methods.\n" + } + ] + }, + "flags": {}, + "id": 749, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "ensure_context", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The method to wrap.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 750, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 751, + "module": "_utils.crypto", + "name": "compute_short_hash", + "parsedDocstring": { + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n", + "args": { + "data": "The binary data to be hashed.", + "length": "The length of the hash to be returned.\n" + }, + "returns": "A substring (prefix) of the hexadecimal hash of the data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/crypto.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A substring (prefix) of the hexadecimal hash of the data." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n" + } + ] + }, + "flags": {}, + "id": 752, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compute_short_hash", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The binary data to be hashed." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 753, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The length of the hash to be returned.\n" + } + ] + }, + "defaultValue": "8", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 754, + "kind": 32768, + "kindString": "Parameter", + "name": "length", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate a random object ID." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 755, + "module": "_utils.crypto", + "name": "crypto_random_object_id", + "parsedDocstring": { + "text": "Generate a random object ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/crypto.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate a random object ID." 
+ } + ] + }, + "flags": {}, + "id": 756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crypto_random_object_id", + "parameters": [ + { + "defaultValue": "17", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 757, + "kind": 32768, + "kindString": "Parameter", + "name": "length", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 758, + "module": "_utils.docs", + "name": "GroupName", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/docs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 759, + "module": "_utils.docs", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/docs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a symbol for rendering and grouping in documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 760, + "module": "_utils.docs", + "name": "docs_group", + "parsedDocstring": { + "text": "Mark a symbol for rendering and grouping in 
documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n", + "args": { + "group_name": "The documentation group to which the symbol belongs.\n" + }, + "returns": "The original callable without modification." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/docs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The original callable without modification." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a symbol for rendering and grouping in documentation.\n\nThis decorator is used solely for documentation purposes and does not modify the behavior\nof the decorated callable.\n" + } + ] + }, + "flags": {}, + "id": 761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "docs_group", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The documentation group to which the symbol belongs.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 762, + "kind": 32768, + "kindString": "Parameter", + "name": "group_name", + "type": { + "name": "GroupName", + "type": "reference", + "target": "758" + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[T]" + }, + { + "type": "reference", + "name": "T", + "target": "299" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Infer the MIME content type from the value.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 763, + "module": "_utils.file", + "name": "infer_mime_type", + 
"parsedDocstring": { + "text": "Infer the MIME content type from the value.\n", + "args": { + "value": "The value to infer the content type from.\n" + }, + "returns": "The inferred MIME content type." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The inferred MIME content type." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Infer the MIME content type from the value.\n" + } + ] + }, + "flags": {}, + "id": 764, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "infer_mime_type", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The value to infer the content type from.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 765, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Serialize an object to a JSON-formatted string with specific settings.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 766, + "module": "_utils.file", + "name": "json_dumps", + "parsedDocstring": { + "text": "Serialize an object to a JSON-formatted string with specific settings.\n", + "args": { + "obj": "The object to serialize.\n" + }, + "returns": "A string containing the JSON representation of the input object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string containing the JSON representation of the input object." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Serialize an object to a JSON-formatted string with specific settings.\n" + } + ] + }, + "flags": {}, + "id": 767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "json_dumps", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The object to serialize.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 768, + "kind": 32768, + "kindString": "Parameter", + "name": "obj", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Write data to a file atomically to prevent data corruption or partial writes.\n\nThis function handles both text and binary data. The binary mode is automatically\ndetected based on the data type (bytes = binary, str = text). It ensures atomic\nwriting by creating a temporary file and then atomically replacing the target file,\nwhich prevents data corruption if the process is interrupted during the write operation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 769, + "module": "_utils.file", + "name": "atomic_write", + "parsedDocstring": { + "text": "Write data to a file atomically to prevent data corruption or partial writes.\n\nThis function handles both text and binary data. 
The binary mode is automatically\ndetected based on the data type (bytes = binary, str = text). It ensures atomic\nwriting by creating a temporary file and then atomically replacing the target file,\nwhich prevents data corruption if the process is interrupted during the write operation.\n", + "args": { + "path": "The path to the destination file.", + "data": "The data to write to the file (string or bytes).", + "retry_count": "Internal parameter to track the number of retry attempts (default: 0)." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Write data to a file atomically to prevent data corruption or partial writes.\n\nThis function handles both text and binary data. The binary mode is automatically\ndetected based on the data type (bytes = binary, str = text). It ensures atomic\nwriting by creating a temporary file and then atomically replacing the target file,\nwhich prevents data corruption if the process is interrupted during the write operation.\n" + } + ] + }, + "flags": {}, + "id": 770, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "atomic_write", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The path to the destination file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 771, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to write to the file (string or bytes)." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 772, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "str | bytes", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "bytes", + "target": "682" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal parameter to track the number of retry attempts (default: 0)." + } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 773, + "kind": 32768, + "kindString": "Parameter", + "name": "retry_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 774, + "module": "_utils.file", + "name": "export_json_to_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 775, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_json_to_stream", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 776, + "kind": 32768, + "kindString": "Parameter", + "name": "iterator", + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "flags": { + "isOptional": 
false, + "keyword-only": false + }, + "id": 777, + "kind": 32768, + "kindString": "Parameter", + "name": "dst", + "type": { + "name": "TextIO", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 543, + "module": "_types", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 753 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 544, + "module": "_types", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 757 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 545, + "module": "_types", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 546, + "module": "_types", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 764 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 547, + "module": "_types", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 768 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ], + "target": "981" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 548, + "module": "_types", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 771 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 549, + "module": "_types", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 774 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 550, + "module": "_types", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 778 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 551, + "module": "_types", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 782 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 779, + "module": "_utils.file", + "name": "export_csv_to_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/file.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 780, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_csv_to_stream", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 781, + "kind": 
32768, + "kindString": "Parameter", + "name": "iterator", + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 782, + "kind": 32768, + "kindString": "Parameter", + "name": "dst", + "type": { + "name": "TextIO", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 553, + "module": "_types", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 554, + "module": "_types", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 792 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 555, + "module": "_types", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 556, + "module": "_types", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 799 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 557, + "module": "_types", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 803 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 558, + "module": "_types", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 806 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 559, + "module": "_types", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 810 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 560, + "module": "_types", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 561, + "module": "_types", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 817 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 785, + "module": "_utils.globs", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/globs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 787, + "kind": 32768, + 
"kindString": "Parameter", + "name": "glob", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wraps a glob pattern (supports the `*`, `**`, `?` wildcards)." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 785 + ], + "title": "Methods" + } + ], + "id": 784, + "module": "_utils.globs", + "name": "Glob", + "parsedDocstring": { + "text": "Wraps a glob pattern (supports the `*`, `**`, `?` wildcards)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/globs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 788, + "module": "_utils.html_to_text", + "name": "SKIP_TAGS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/html_to_text.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 789, + "module": "_utils.html_to_text", + "name": "BLOCK_TAGS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/html_to_text.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": 
"" + } + ] + }, + "flags": {}, + "groups": [], + "id": 790, + "module": "_utils.models", + "name": "timedelta_ms", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 791, + "module": "_utils.models", + "name": "timedelta_secs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise ValueError if there are more non-None kwargs then max_kwargs." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 792, + "module": "_utils.raise_if_too_many_kwargs", + "name": "raise_if_too_many_kwargs", + "parsedDocstring": { + "text": "Raise ValueError if there are more non-None kwargs then max_kwargs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/raise_if_too_many_kwargs.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 4 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Raise ValueError if there are more non-None kwargs then max_kwargs." 
+ } + ] + }, + "flags": {}, + "id": 793, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "raise_if_too_many_kwargs", + "parameters": [ + { + "defaultValue": "1", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 794, + "kind": 32768, + "kindString": "Parameter", + "name": "max_kwargs", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 795, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 796, + "module": "_utils.recoverable_state", + "name": "TStateModel", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new recoverable state object.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 798, + "module": "_utils.recoverable_state", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new recoverable state object.\n", + "args": { + "default_state": "The default state model instance to use when no persisted state is found.\nA deep copy is made each time the state is used.", + "persist_state_key": "The key under which the state is stored in the KeyValueStore", + "persistence_enabled": "Flag to enable or disable state 
persistence. Use 'explicit_only' if you want to be able\nto save the state manually, but without any automatic persistence.", + "persist_state_kvs_name": "The name of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used.", + "persist_state_kvs_id": "The identifier of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used.", + "persist_state_kvs_factory": "Factory that can be awaited to create KeyValueStore to use for persistence. If\nnot provided, a system-wide KeyValueStore will be used, based on service locator configuration.", + "logger": "A logger instance for logging operations related to state persistence" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new recoverable state object.\n" + } + ] + }, + "flags": {}, + "id": 799, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The default state model instance to use when no persisted state is found.\nA deep copy is made each time the state is used." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 800, + "kind": 32768, + "kindString": "Parameter", + "name": "default_state", + "type": { + "name": "TStateModel", + "type": "reference", + "target": "796" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which the state is stored in the KeyValueStore" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 801, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to enable or disable state persistence. Use 'explicit_only' if you want to be able\nto save the state manually, but without any automatic persistence." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 802, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": true + }, + { + "type": "literal", + "value": false + }, + { + "type": "literal", + "value": "explicit_only" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 803, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The identifier of the KeyValueStore to use for persistence.\nIf neither a name nor and id are supplied, the default store will be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 804, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Factory that can be awaited to create KeyValueStore to use for persistence. If\nnot provided, a system-wide KeyValueStore will be used, based on service locator configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 805, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_factory", + "type": { + "name": "Callable[[], Coroutine[None, None, KeyValueStore]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Coroutine", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "3700" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance for logging operations related to state persistence" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 806, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "logging.Logger", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 807, + "module": "_utils.recoverable_state", + "name": "initialize", + "parsedDocstring": { + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. 
It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n", + "returns": "The loaded state model" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The loaded state model" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Initialize the recoverable state.\n\nThis method must be called before using the recoverable state. It loads the saved state\nif persistence is enabled and registers the object to listen for PERSIST_STATE events.\n" + } + ] + }, + "flags": {}, + "id": 808, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "initialize", + "parameters": [], + "type": { + "name": "TStateModel", + "type": "reference", + "target": "796" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 809, + "module": "_utils.recoverable_state", + "name": "teardown", + "parsedDocstring": { + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the recoverable state.\n\nIf persistence is enabled, this method deregisters the object from PERSIST_STATE events\nand persists the current state one last time." + } + ] + }, + "flags": {}, + "id": 810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "teardown", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current state." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 811, + "module": "_utils.recoverable_state", + "name": "current_value", + "parsedDocstring": { + "text": "Get the current state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 139 + } + ], + "type": { + "name": "TStateModel", + "type": "reference", + "target": "796" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the state has already been initialized." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 812, + "module": "_utils.recoverable_state", + "name": "is_initialized", + "parsedDocstring": { + "text": "Check if the state has already been initialized." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if there is any persisted state in the key-value store." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 813, + "module": "_utils.recoverable_state", + "name": "has_persisted_state", + "parsedDocstring": { + "text": "Check if there is any persisted state in the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 151 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if there is any persisted state in the key-value store." + } + ] + }, + "flags": {}, + "id": 814, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "has_persisted_state", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 815, + "module": "_utils.recoverable_state", + "name": "reset", + "parsedDocstring": { + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the state to the default values and clear any persisted state.\n\nResets the current state to the default state and, if persistence is enabled,\nclears the persisted state from the KeyValueStore." + } + ] + }, + "flags": {}, + "id": 816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 817, + "module": "_utils.recoverable_state", + "name": "persist_state", + "parsedDocstring": { + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n", + "args": { + "event_data": "Optional data associated with a PERSIST_STATE event" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Persist the current state to the KeyValueStore.\n\nThis method is typically called in response to a PERSIST_STATE event, but can also be called\ndirectly when needed.\n" + } + ] + }, + "flags": {}, + "id": 818, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_state", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional data associated with a PERSIST_STATE event" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 819, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventPersistStateData", + "target": "1881" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for managing persistent recoverable state using a Pydantic model.\n\nThis class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved\nacross migrations or restarts. It manages the loading, saving, and resetting of state data,\nwith optional persistence capabilities.\n\nThe state is represented by a Pydantic model that can be serialized to and deserialized from JSON.\nThe class automatically hooks into the event system to persist state when needed.\n\nType Parameters:\nTStateModel: A Pydantic BaseModel type that defines the structure of the state data.\nTypically, it should be inferred from the `default_state` constructor parameter." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 798, + 813, + 807, + 817, + 815, + 809 + ], + "title": "Methods" + }, + { + "children": [ + 811, + 812 + ], + "title": "Properties" + } + ], + "id": 797, + "module": "_utils.recoverable_state", + "name": "RecoverableState", + "parsedDocstring": { + "text": "A class for managing persistent recoverable state using a Pydantic model.\n\nThis class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved\nacross migrations or restarts. It manages the loading, saving, and resetting of state data,\nwith optional persistence capabilities.\n\nThe state is represented by a Pydantic model that can be serialized to and deserialized from JSON.\nThe class automatically hooks into the event system to persist state when needed.\n\nType Parameters:\nTStateModel: A Pydantic BaseModel type that defines the structure of the state data.\nTypically, it should be inferred from the `default_state` constructor parameter." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recoverable_state.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 820, + "module": "_utils.recurring_task", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 822, + "module": "_utils.recurring_task", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 823, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 824, + "kind": 32768, + "kindString": "Parameter", + "name": "func", + "type": { + "name": "Callable", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 825, + "kind": 32768, + "kindString": "Parameter", + "name": "delay", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + 
"type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 826, + "module": "_utils.recurring_task", + "name": "__aenter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 827, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 828, + "module": "_utils.recurring_task", + "name": "__aexit__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 829, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 830, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } 
+ }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 831, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 832, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the recurring task execution." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 833, + "module": "_utils.recurring_task", + "name": "start", + "parsedDocstring": { + "text": "Start the recurring task execution." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the recurring task execution." + } + ] + }, + "flags": {}, + "id": 834, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "start", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop the recurring task execution." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 835, + "module": "_utils.recurring_task", + "name": "stop", + "parsedDocstring": { + "text": "Stop the recurring task execution." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop the recurring task execution." + } + ] + }, + "flags": {}, + "id": 836, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Class for creating and managing recurring tasks.\n" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 826, + 828, + 822, + 833, + 835 + ], + "title": "Methods" + } + ], + "id": 821, + "module": "_utils.recurring_task", + "name": "RecurringTask", + "parsedDocstring": { + "text": "Class for creating and managing recurring tasks.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/recurring_task.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 837, + "module": "_utils.requests", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 
2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 838, + "module": "_utils.requests", + "name": "normalize_url", + "parsedDocstring": { + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n", + "args": { + "url": "The URL to be normalized.", + "keep_url_fragment": "Flag to determine whether the fragment part of the URL should be retained.\n" + }, + "returns": "A string containing the normalized URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string containing the normalized URL." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Normalize a URL.\n\nThis function cleans and standardizes a URL by removing leading and trailing whitespaces,\nconverting the scheme and netloc to lower case, stripping unwanted tracking parameters\n(specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\nand optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\nidentical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n" + } + ] + }, + "flags": {}, + "id": 839, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "normalize_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to be normalized." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 840, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to determine whether the fragment part of the URL should be retained.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 841, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\nis just the normalized URL. 
Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 842, + "module": "_utils.requests", + "name": "compute_unique_key", + "parsedDocstring": { + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\nis just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n", + "args": { + "url": "The request URL.", + "method": "The HTTP method.", + "headers": "The HTTP headers.", + "payload": "The data to be sent as the request body.", + "keep_url_fragment": "A flag indicating whether to keep the URL fragment.", + "use_extended_unique_key": "A flag indicating whether to include a hashed payload in the key.", + "session_id": "The ID of a specific `Session` to which the request will be strictly bound\n" + }, + "returns": "A string representing the unique key for the request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/requests.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A string representing the unique key for the request." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Compute a unique key for caching & deduplication of requests.\n\nThis function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\nis True and a payload is provided, the payload is hashed and included in the key. 
Otherwise, the unique key\nis just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\nand included in the key.\n" + } + ] + }, + "flags": {}, + "id": 843, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compute_unique_key", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request URL." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 844, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 845, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 846, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 847, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of a specific `Session` to which the request will be strictly bound\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 848, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A flag indicating whether to keep the URL fragment." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 849, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_url_fragment", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A flag indicating whether to include a hashed payload in the key." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 850, + "kind": 32768, + "kindString": "Parameter", + "name": "use_extended_unique_key", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 851, + "module": "_utils.robots", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 853, + "module": "_utils.robots", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 854, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 855, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 856, + "kind": 32768, + "kindString": "Parameter", + "name": "robots", + "type": { + "name": "Protego", + "type": "reference" + 
} + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 857, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 858, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a `RobotsTxtFile` instance from the given content.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 859, + "module": "_utils.robots", + "name": "from_content", + "parsedDocstring": { + "text": "Create a `RobotsTxtFile` instance from the given content.\n", + "args": { + "url": "The URL associated with the robots.txt file.", + "content": "The raw string content of the robots.txt file to be parsed." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a `RobotsTxtFile` instance from the given content.\n" + } + ] + }, + "flags": {}, + "id": 860, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_content", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL associated with the robots.txt file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 861, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The raw string content of the robots.txt file to be parsed." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 862, + "kind": 32768, + "kindString": "Parameter", + "name": "content", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 863, + "module": "_utils.robots", + "name": "find", + "parsedDocstring": { + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n", + "args": { + "url": "The URL whose domain will be used to find the corresponding robots.txt file.", + "http_client": "Optional `ProxyInfo` to be used when fetching the robots.txt file. 
If None, no proxy is used.", + "proxy_info": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine the location of a robots.txt file for a URL and fetch it.\n" + } + ] + }, + "flags": {}, + "id": 864, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "find", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL whose domain will be used to find the corresponding robots.txt file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 865, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 866, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 867, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Load the robots.txt file for a given URL.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 868, + "module": "_utils.robots", + "name": "load", + "parsedDocstring": { + "text": "Load the robots.txt file for a given URL.\n", + "args": { + "url": "The direct URL of the robots.txt file to be loaded.", + "http_client": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file.", + "proxy_info": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Load the robots.txt file for a given URL.\n" + } + ] + }, + "flags": {}, + "id": 869, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "load", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The direct URL of the robots.txt file to be loaded." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 870, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `HttpClient` instance used to perform the network request for fetching the robots.txt file." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 871, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 872, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the given URL is allowed for the given user agent.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 873, + "module": "_utils.robots", + "name": "is_allowed", + "parsedDocstring": { + "text": "Check if the given URL is allowed for the given user agent.\n", + "args": { + "url": "The URL to check against the robots.txt rules.", + "user_agent": "The user-agent string to check permissions for. Defaults to '*' which matches any user-agent." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the given URL is allowed for the given user agent.\n" + } + ] + }, + "flags": {}, + "id": 874, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_allowed", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to check against the robots.txt rules." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 875, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The user-agent string to check permissions for. Defaults to '*' which matches any user-agent." + } + ] + }, + "defaultValue": "'*'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 876, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the list of sitemaps urls from the robots.txt file." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 877, + "module": "_utils.robots", + "name": "get_sitemaps", + "parsedDocstring": { + "text": "Get the list of sitemaps urls from the robots.txt file." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the list of sitemaps urls from the robots.txt file." + } + ] + }, + "flags": {}, + "id": 878, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_sitemaps", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the crawl delay for the given user agent.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 879, + "module": "_utils.robots", + "name": "get_crawl_delay", + "parsedDocstring": { + "text": "Get the crawl delay for the given user agent.\n", + "args": { + "user_agent": "The user-agent string to check the crawl delay for. Defaults to '*' which matches any\nuser-agent." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the crawl delay for the given user agent.\n" + } + ] + }, + "flags": {}, + "id": 880, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_crawl_delay", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The user-agent string to check the crawl delay for. Defaults to '*' which matches any\nuser-agent." 
+ } + ] + }, + "defaultValue": "'*'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 881, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the sitemaps from the robots.txt file and return a `Sitemap` instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 882, + "module": "_utils.robots", + "name": "parse_sitemaps", + "parsedDocstring": { + "text": "Parse the sitemaps from the robots.txt file and return a `Sitemap` instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the sitemaps from the robots.txt file and return a `Sitemap` instance." + } + ] + }, + "flags": {}, + "id": 883, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_sitemaps", + "parameters": [], + "type": { + "name": "Sitemap", + "type": "reference", + "target": "1026" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the sitemaps in the robots.txt file and return a list URLs." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 884, + "module": "_utils.robots", + "name": "parse_urls_from_sitemaps", + "parsedDocstring": { + "text": "Parse the sitemaps in the robots.txt file and return a list URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse the sitemaps in the robots.txt file and return a list URLs." + } + ] + }, + "flags": {}, + "id": 885, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_urls_from_sitemaps", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 853, + 863, + 859, + 879, + 877, + 873, + 868, + 882, + 884 + ], + "title": "Methods" + } + ], + "id": 852, + "module": "_utils.robots", + "name": "RobotsTxtFile", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/robots.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 886, + "module": "_utils.system", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 888, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ratio of CPU currently in use, represented as a float between 0 and 1." + } + ] + }, + "flags": {}, + "groups": [], + "id": 889, + "module": "_utils.system", + "name": "used_ratio", + "parsedDocstring": { + "text": "The ratio of CPU currently in use, represented as a float between 0 and 1." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about the CPU usage." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 888, + 889 + ], + "title": "Properties" + } + ], + "id": 887, + "module": "_utils.system", + "name": "CpuInfo", + "parsedDocstring": { + "text": "Information about the CPU usage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 891, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." + } + ] + }, + "flags": {}, + "groups": [], + "id": 892, + "module": "_utils.system", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about the memory usage." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 892, + 891 + ], + "title": "Properties" + } + ], + "id": 890, + "module": "_utils.system", + "name": "MemoryUsageInfo", + "parsedDocstring": { + "text": "Information about the memory usage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "MemoryInfo", + "target": "893", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 894, + "module": "_utils.system", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "MemoryUsageInfo.model_config", + "target": 891, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total memory available in the system." + } + ] + }, + "flags": {}, + "groups": [], + "id": 895, + "module": "_utils.system", + "name": "total_size", + "parsedDocstring": { + "text": "Total memory available in the system." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total memory used by all processes system-wide (including non-crawlee processes)." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 896, + "module": "_utils.system", + "name": "system_wide_used_size", + "parsedDocstring": { + "text": "Total memory used by all processes system-wide (including non-crawlee processes)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "ByteSize", + "type": "reference", + "target": "681" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory usage of the current Python process and its children." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4391, + "module": "_utils.system", + "name": "current_size", + "parsedDocstring": { + "text": "Memory usage of the current Python process and its children." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='currentSize'), ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MemoryUsageInfo.current_size", + "target": 892, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about system memory." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4391, + 894, + 896, + 895 + ], + "title": "Properties" + } + ], + "id": 893, + "module": "_utils.system", + "name": "MemoryInfo", + "parsedDocstring": { + "text": "Information about system memory." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MemoryUsageInfo", + "target": "890", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 897, + "module": "_utils.system", + "name": "get_cpu_info", + "parsedDocstring": { + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current CPU usage.\n\nIt utilizes the `psutil` library. Function `psutil.cpu_percent()` returns a float representing the current\nsystem-wide CPU utilization as a percentage." 
+ } + ] + }, + "flags": {}, + "id": 898, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cpu_info", + "parameters": [], + "type": { + "name": "CpuInfo", + "type": "reference", + "target": "887" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 899, + "module": "_utils.system", + "name": "get_memory_info", + "parsedDocstring": { + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/system.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current memory usage of the process and its children.\n\nIt utilizes the `psutil` library." 
+ } + ] + }, + "flags": {}, + "id": 900, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_memory_info", + "parameters": [], + "type": { + "name": "MemoryInfo", + "type": "reference", + "target": "893" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 902, + "module": "_utils.time", + "name": "wall", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 903, + "module": "_utils.time", + "name": "cpu", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 903, + 902 + ], + "title": "Properties" + } + ], + "id": 901, + "module": "_utils.time", + "name": "TimerResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." + } + ] + }, + "decorations": [ + { + "name": "contextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 904, + "module": "_utils.time", + "name": "measure_time", + "parsedDocstring": { + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Measure the execution time (wall-clock and CPU) between the start and end of the with-block." 
+ } + ] + }, + "flags": {}, + "id": 905, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "measure_time", + "parameters": [], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TimerResult", + "target": "901" + } + ] + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 907, + "module": "_utils.time", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 908, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 909, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 910, + "module": "_utils.time", + "name": "__aenter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 911, + "kind": 4096, + "kindString": "Call 
signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "timedelta", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 912, + "module": "_utils.time", + "name": "__aexit__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 913, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 914, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 915, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 916, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + 
], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keeps track of a time budget shared by multiple independent async operations.\n\nProvides a reusable, non-reentrant context manager interface." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 910, + 912, + 907 + ], + "title": "Methods" + } + ], + "id": 906, + "module": "_utils.time", + "name": "SharedTimeout", + "parsedDocstring": { + "text": "Keeps track of a time budget shared by multiple independent async operations.\n\nProvides a reusable, non-reentrant context manager interface." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format a timedelta into a human-readable string with appropriate units." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 917, + "module": "_utils.time", + "name": "format_duration", + "parsedDocstring": { + "text": "Format a timedelta into a human-readable string with appropriate units." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/time.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Format a timedelta into a human-readable string with appropriate units." 
+ } + ] + }, + "flags": {}, + "id": 918, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "format_duration", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 919, + "kind": 32768, + "kindString": "Parameter", + "name": "duration", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." + } + ] + }, + "decorations": [ + { + "name": "contextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 920, + "module": "_utils.try_import", + "name": "try_import", + "parsedDocstring": { + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context manager to attempt importing symbols into a module.\n\nIf an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object." 
+ } + ] + }, + "flags": {}, + "id": 921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "try_import", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 922, + "kind": 32768, + "kindString": "Parameter", + "name": "module_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 923, + "kind": 32768, + "kindString": "Parameter", + "name": "symbol_names", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Install an import hook for a specified module." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 924, + "module": "_utils.try_import", + "name": "install_import_hook", + "parsedDocstring": { + "text": "Install an import hook for a specified module." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Install an import hook for a specified module." 
+ } + ] + }, + "flags": {}, + "id": 925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "install_import_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 926, + "kind": 32768, + "kindString": "Parameter", + "name": "module_name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The error message associated with the failed import." + } + ] + }, + "flags": {}, + "groups": [], + "id": 928, + "module": "_utils.try_import", + "name": "message", + "parsedDocstring": { + "text": "The error message associated with the failed import." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent a placeholder for a failed import." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 928 + ], + "title": "Properties" + } + ], + "id": 927, + "module": "_utils.try_import", + "name": "FailedImport", + "parsedDocstring": { + "text": "Represent a placeholder for a failed import." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 930, + "module": "_utils.try_import", + "name": "__getattribute__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 931, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getattribute__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 932, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper class for modules to handle attribute access for failed imports." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 930 + ], + "title": "Methods" + } + ], + "id": 929, + "module": "_utils.try_import", + "name": "ImportWrapper", + "parsedDocstring": { + "text": "A wrapper class for modules to handle attribute access for failed imports." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/try_import.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a URL is absolute." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 933, + "module": "_utils.urls", + "name": "is_url_absolute", + "parsedDocstring": { + "text": "Check if a URL is absolute." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a URL is absolute." + } + ] + }, + "flags": {}, + "id": 934, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_url_absolute", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 935, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a relative URL to an absolute URL using a base URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 936, + "module": "_utils.urls", + "name": "convert_to_absolute_url", + "parsedDocstring": { + "text": "Convert a relative URL to an absolute URL using a base URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert a relative URL to an absolute URL using a base URL." + } + ] + }, + "flags": {}, + "id": 937, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "convert_to_absolute_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 938, + "kind": 32768, + "kindString": "Parameter", + "name": "base_url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 939, + "kind": 32768, + "kindString": "Parameter", + "name": "relative_url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 940, + "module": "_utils.urls", + "name": "to_absolute_url_iterator", + "parsedDocstring": { + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert an iterator of relative URLs to absolute URLs using a base URL." 
+ } + ] + }, + "flags": {}, + "id": 941, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_absolute_url_iterator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 942, + "kind": 32768, + "kindString": "Parameter", + "name": "base_url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 943, + "kind": 32768, + "kindString": "Parameter", + "name": "urls", + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 944, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Validate the given HTTP URL.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 945, + "module": "_utils.urls", + "name": "validate_http_url", + "parsedDocstring": { + "text": "Validate the given HTTP URL.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/urls.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Validate the given HTTP URL.\n" + } + ] + }, + "flags": {}, + "id": 946, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": 
"validate_http_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 947, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 948, + "module": "_utils.wait", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 949, + "module": "_utils.wait", + "name": "wait_for", + "parsedDocstring": { + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n", + "args": { + "operation": "A function that returns the future to wait for.", + "timeout": "How long should we wait before cancelling the future.", + "timeout_message": "Message to be included in the `TimeoutError` in case of timeout.", + "max_retries": "How many times should the operation be attempted.", + "logger": "Used to report information about 
retries as they happen." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for an async operation to complete.\n\nIf the wait times out, `TimeoutError` is raised and the future is cancelled.\nOptionally retry on error.\n" + } + ] + }, + "flags": {}, + "id": 950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function that returns the future to wait for." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 951, + "kind": 32768, + "kindString": "Parameter", + "name": "operation", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "T", + "target": "299" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How long should we wait before cancelling the future." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 952, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Message to be included in the `TimeoutError` in case of timeout." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 953, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout_message", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How many times should the operation be attempted." + } + ] + }, + "defaultValue": "1", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 954, + "kind": 32768, + "kindString": "Parameter", + "name": "max_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used to report information about retries as they happen." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 955, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "Logger", + "type": "reference" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all tasks to finish or until the timeout is reached.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 956, + "module": "_utils.wait", + "name": "wait_for_all_tasks_for_finish", + "parsedDocstring": { + "text": "Wait for all tasks to finish or until the timeout is reached.\n", + "args": { + "tasks": "A sequence of asyncio tasks to wait for.", + "logger": "Logger to use for reporting.", + "timeout": "How long should we wait before cancelling the tasks." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/wait.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all tasks to finish or until the timeout is reached.\n" + } + ] + }, + "flags": {}, + "id": 957, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_tasks_for_finish", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A sequence of asyncio tasks to wait for." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 958, + "kind": 32768, + "kindString": "Parameter", + "name": "tasks", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "asyncio.Task" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger to use for reporting." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 959, + "kind": 32768, + "kindString": "Parameter", + "name": "logger", + "type": { + "name": "Logger", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "How long should we wait before cancelling the tasks." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 960, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 961, + "module": "_utils.sitemap", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 962, + "module": "_utils.sitemap", + "name": "VALID_CHANGE_FREQS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 963, + "module": "_utils.sitemap", + "name": "SITEMAP_HEADERS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": 
"reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 964, + "module": "_utils.sitemap", + "name": "SITEMAP_URL_PATTERN", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 965, + "module": "_utils.sitemap", + "name": "COMMON_SITEMAP_PATHS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 967, + "module": "_utils.sitemap", + "name": "loc", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 968, + "module": "_utils.sitemap", + "name": "lastmod", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 969, + "module": "_utils.sitemap", + "name": "changefreq", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 970, + "module": "_utils.sitemap", + "name": "priority", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 971, + "module": "_utils.sitemap", + "name": "origin_sitemap_url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 
45 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "()", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 969, + 968, + 967, + 971, + 970 + ], + "title": "Properties" + } + ], + "id": 966, + "module": "_utils.sitemap", + "name": "SitemapUrl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 973, + "module": "_utils.sitemap", + "name": "loc", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 974, + "module": "_utils.sitemap", + "name": "origin_sitemap_url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + 
], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "()", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 973, + 974 + ], + "title": "Properties" + } + ], + "id": 972, + "module": "_utils.sitemap", + "name": "NestedSitemap", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 976, + "module": "_utils.sitemap", + "name": "emit_nested_sitemaps", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 977, + "module": "_utils.sitemap", + "name": "max_depth", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 978, + "module": "_utils.sitemap", + "name": "sitemap_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 979, + "module": "_utils.sitemap", + "name": "timeout", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 976, + 977, + 978, + 979 + ], + "title": "Properties" + } + ], + "id": 975, + "module": "_utils.sitemap", + "name": "ParseSitemapOptions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 981, + "module": "_utils.sitemap", + "name": "type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + 
"value": "url" + }, + { + "type": "literal", + "value": "raw" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 982, + "module": "_utils.sitemap", + "name": "url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 983, + "module": "_utils.sitemap", + "name": "content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 984, + "module": "_utils.sitemap", + "name": "depth", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 983, + 984, + 981, + 982 + 
], + "title": "Properties" + } + ], + "id": 980, + "module": "_utils.sitemap", + "name": "SitemapSource", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 986, + "module": "_utils.sitemap", + "name": "type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "url" + }, + { + "type": "literal", + "value": "sitemap_url" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 987, + "module": "_utils.sitemap", + "name": "loc", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 988, + "module": "_utils.sitemap", + "name": "url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 989, + "module": "_utils.sitemap", + "name": "lastmod", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 990, + "module": "_utils.sitemap", + "name": "changefreq", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 991, + "module": "_utils.sitemap", + "name": "priority", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "float | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + 
] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 990, + 989, + 987, + 991, + 986, + 988 + ], + "title": "Properties" + } + ], + "id": 985, + "module": "_utils.sitemap", + "name": "_SitemapItem", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 993, + "module": "_utils.sitemap", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 994, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 995, + "module": "_utils.sitemap", + "name": "items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", 
+ "name": "_SitemapItem", + "target": "985" + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 996, + "module": "_utils.sitemap", + "name": "startElement", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 997, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "startElement", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 998, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 999, + "kind": 32768, + "kindString": "Parameter", + "name": "attrs", + "type": { + "name": "AttributesImpl", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1000, + "module": "_utils.sitemap", + "name": "characters", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1001, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "characters", + "parameters": [ + 
{ + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1002, + "kind": 32768, + "kindString": "Parameter", + "name": "content", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1003, + "module": "_utils.sitemap", + "name": "endElement", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1004, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "endElement", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1005, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 993, + 1000, + 1003, + 996 + ], + "title": "Methods" + }, + { + "children": [ + 995 + ], + "title": "Properties" + } + ], + "id": 992, + "module": "_utils.sitemap", + "name": "_XMLSaxSitemapHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + 
"kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1007, + "module": "_utils.sitemap", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1008, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process a chunk of text data and yield items one by one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1009, + "module": "_utils.sitemap", + "name": "process_chunk", + "parsedDocstring": { + "text": "Process a chunk of text data and yield items one by one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process a chunk of text data and yield items one by one." 
+ } + ] + }, + "flags": {}, + "id": 1010, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "process_chunk", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1011, + "kind": 32768, + "kindString": "Parameter", + "name": "chunk", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "_SitemapItem", + "target": "985" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process any remaining data in the buffer, yielding items one by one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1012, + "module": "_utils.sitemap", + "name": "flush", + "parsedDocstring": { + "text": "Process any remaining data in the buffer, yielding items one by one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process any remaining data in the buffer, yielding items one by one." + } + ] + }, + "flags": {}, + "id": 1013, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "flush", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "_SitemapItem", + "target": "985" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1014, + "module": "_utils.sitemap", + "name": "close", + "parsedDocstring": { + "text": "Clean up resources." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources." + } + ] + }, + "flags": {}, + "id": 1015, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "close", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for plaintext sitemaps that processes data as a stream." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1007, + 1014, + 1012, + 1009 + ], + "title": "Methods" + } + ], + "id": 1006, + "module": "_utils.sitemap", + "name": "_TxtSitemapParser", + "parsedDocstring": { + "text": "Parser for plaintext sitemaps that processes data as a stream." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1017, + "module": "_utils.sitemap", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1018, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process a chunk of XML data and yield items one by one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1019, + "module": "_utils.sitemap", + "name": "process_chunk", + "parsedDocstring": { + "text": "Process a chunk of XML data and yield items one by one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 174 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process a chunk of XML data and yield items one by one." 
+ } + ] + }, + "flags": {}, + "id": 1020, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "process_chunk", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1021, + "kind": 32768, + "kindString": "Parameter", + "name": "chunk", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "_SitemapItem", + "target": "985" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process any remaining data in the buffer, yielding items one by one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1022, + "module": "_utils.sitemap", + "name": "flush", + "parsedDocstring": { + "text": "Process any remaining data in the buffer, yielding items one by one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Process any remaining data in the buffer, yielding items one by one." + } + ] + }, + "flags": {}, + "id": 1023, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "flush", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "_SitemapItem", + "target": "985" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1024, + "module": "_utils.sitemap", + "name": "close", + "parsedDocstring": { + "text": "Clean up resources." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources." + } + ] + }, + "flags": {}, + "id": 1025, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "close", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for XML sitemaps using SAX to process data as a stream." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1017, + 1024, + 1022, + 1019 + ], + "title": "Methods" + } + ], + "id": 1016, + "module": "_utils.sitemap", + "name": "_XmlSitemapParser", + "parsedDocstring": { + "text": "Parser for XML sitemaps using SAX to process data as a stream." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1027, + "module": "_utils.sitemap", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 384 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1028, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1029, + "kind": 32768, + "kindString": "Parameter", + "name": "urls", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1030, + "module": "_utils.sitemap", + "name": "urls", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 388 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { 
+ "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1031, + "module": "_utils.sitemap", + "name": "try_common_names", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 392 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1032, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "try_common_names", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1033, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1034, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1035, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Sitemap", + "type": "reference", + "target": "1026" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1036, + "module": "_utils.sitemap", + "name": "load", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 398 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1037, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "load", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1038, + "kind": 32768, + "kindString": "Parameter", + "name": "urls", + "type": { + "name": "str | list[str]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1039, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1040, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1041, + "kind": 32768, + "kindString": "Parameter", + "name": "parse_sitemap_options", + "type": { + "name": "ParseSitemapOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ParseSitemapOptions", + "target": "975" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Sitemap", + "type": "reference", + "target": "1026" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1042, + "module": "_utils.sitemap", + "name": "from_xml_string", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 412 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1043, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_xml_string", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1044, + "kind": 32768, + "kindString": "Parameter", + "name": "content", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sitemap", + "type": "reference", + "target": "1026" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1045, + "module": "_utils.sitemap", + "name": "parse", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 416 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1046, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1047, + "kind": 32768, + "kindString": "Parameter", + "name": "sources", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + 
"name": "SitemapSource", + "target": "980" + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1048, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1049, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1050, + "kind": 32768, + "kindString": "Parameter", + "name": "parse_sitemap_options", + "type": { + "name": "ParseSitemapOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ParseSitemapOptions", + "target": "975" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Sitemap", + "type": "reference", + "target": "1026" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1027, + 1042, + 1036, + 1045, + 1031 + ], + "title": "Methods" + }, + { + "children": [ + 1030 + ], + "title": "Properties" + } + ], + "id": 1026, + "module": "_utils.sitemap", + "name": "Sitemap", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 383 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + 
"children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse sitemap(s) and yield URLs found in them.\n\nThis function coordinates the process of fetching and parsing sitemaps,\nhandling both URL-based and raw content sources. It follows nested sitemaps\nup to the specified maximum depth." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1051, + "module": "_utils.sitemap", + "name": "parse_sitemap", + "parsedDocstring": { + "text": "Parse sitemap(s) and yield URLs found in them.\n\nThis function coordinates the process of fetching and parsing sitemaps,\nhandling both URL-based and raw content sources. It follows nested sitemaps\nup to the specified maximum depth." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 427 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse sitemap(s) and yield URLs found in them.\n\nThis function coordinates the process of fetching and parsing sitemaps,\nhandling both URL-based and raw content sources. It follows nested sitemaps\nup to the specified maximum depth." 
+ } + ] + }, + "flags": {}, + "id": 1052, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_sitemap", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1053, + "kind": 32768, + "kindString": "Parameter", + "name": "initial_sources", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SitemapSource", + "target": "980" + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1054, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1055, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1056, + "kind": 32768, + "kindString": "Parameter", + "name": "options", + "type": { + "name": "ParseSitemapOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ParseSitemapOptions", + "target": "975" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "SitemapUrl", + "target": "966" + }, + { + "type": "reference", + "name": "NestedSitemap", + "target": "972" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } 
+ ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Discover related sitemaps for the given URLs.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1057, + "module": "_utils.sitemap", + "name": "discover_valid_sitemaps", + "parsedDocstring": { + "text": "Discover related sitemaps for the given URLs.\n", + "args": { + "urls": "List of URLs to discover sitemaps for.", + "http_client": "`HttpClient` to use for making requests.", + "proxy_info": "Proxy configuration to use for requests.", + "request_timeout": "Timeout for each request when checking for sitemaps.", + "method_for_checking": "HTTP method to use when checking for sitemap existence (HEAD or GET)." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/sitemap.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 581 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Discover related sitemaps for the given URLs.\n" + } + ] + }, + "flags": {}, + "id": 1058, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "discover_valid_sitemaps", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of URLs to discover sitemaps for." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1059, + "kind": 32768, + "kindString": "Parameter", + "name": "urls", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`HttpClient` to use for making requests." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1060, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy configuration to use for requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1061, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for each request when checking for sitemaps." + } + ] + }, + "defaultValue": "timedelta(seconds=20)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1062, + "kind": 32768, + "kindString": "Parameter", + "name": "request_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP method to use when checking for sitemap existence (HEAD or GET)." 
+ } + ] + }, + "defaultValue": "'HEAD'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1063, + "kind": 32768, + "kindString": "Parameter", + "name": "method_for_checking", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "HEAD" + }, + { + "type": "literal", + "value": "GET" + } + ] + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 4xx status codes, `False` otherwise." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1064, + "module": "_utils.web", + "name": "is_status_code_client_error", + "parsedDocstring": { + "text": "Return `True` for 4xx status codes, `False` otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/web.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 6 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 4xx status codes, `False` otherwise." 
+ } + ] + }, + "flags": {}, + "id": 1065, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_status_code_client_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1066, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 5xx status codes, `False` otherwise." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1067, + "module": "_utils.web", + "name": "is_status_code_server_error", + "parsedDocstring": { + "text": "Return `True` for 5xx status codes, `False` otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/web.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 5xx status codes, `False` otherwise." + } + ] + }, + "flags": {}, + "id": 1068, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_status_code_server_error", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1069, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 2xx and 3xx status codes, `False` otherwise." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1070, + "module": "_utils.web", + "name": "is_status_code_successful", + "parsedDocstring": { + "text": "Return `True` for 2xx and 3xx status codes, `False` otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_utils/web.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return `True` for 2xx and 3xx status codes, `False` otherwise." + } + ] + }, + "flags": {}, + "id": 1071, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_status_code_successful", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1072, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the controller is using." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1074, + "module": "browsers._browser_controller", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the controller is using." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the list of opened pages." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1075, + "module": "browsers._browser_controller", + "name": "pages", + "parsedDocstring": { + "text": "Return the list of opened pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Page" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1076, + "module": "browsers._browser_controller", + "name": "total_opened_pages", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser was launched." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of currently open pages." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1077, + "module": "browsers._browser_controller", + "name": "pages_count", + "parsedDocstring": { + "text": "Return the number of currently open pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the time when the last page was opened." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1078, + "module": "browsers._browser_controller", + "name": "last_page_opened_at", + "parsedDocstring": { + "text": "Return the time when the last page was opened." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the idle time of the browser controller." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1079, + "module": "browsers._browser_controller", + "name": "idle_time", + "parsedDocstring": { + "text": "Return the idle time of the browser controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser has free capacity to open a new page." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1080, + "module": "browsers._browser_controller", + "name": "has_free_capacity", + "parsedDocstring": { + "text": "Return if the browser has free capacity to open a new page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser is closed." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1081, + "module": "browsers._browser_controller", + "name": "is_browser_connected", + "parsedDocstring": { + "text": "Return if the browser is closed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the type of the browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1082, + "module": "browsers._browser_controller", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the type of the browser." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1083, + "module": "browsers._browser_controller", + "name": "new_page", + "parsedDocstring": { + "text": "Create a new page with the given context options.\n", + "args": { + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "Page: The newly created page.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Page: The newly created page.\n" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "flags": {}, + "id": 1084, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1085, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1086, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1087, + "module": "browsers._browser_controller", + "name": "close", + "parsedDocstring": { + "text": "Close the browser.\n", + "args": { + "force": "Whether to force close all open pages before closing the browser.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "flags": {}, + "id": 1088, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to force close all open pages before closing the browser.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1089, + "kind": 32768, + "kindString": "Parameter", + "name": "force", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for managing browser instance and their pages." + } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1087, + 1083 + ], + "title": "Methods" + }, + { + "children": [ + 1074, + 1082, + 1080, + 1079, + 1081, + 1078, + 1075, + 1077, + 1076 + ], + "title": "Properties" + } + ], + "id": 1073, + "module": "browsers._browser_controller", + "name": "BrowserController", + "parsedDocstring": { + "text": "An abstract base class for managing browser instance and their pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightBrowserController", + "target": "1219", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the plugin is managing." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1091, + "module": "browsers._browser_plugin", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the plugin is managing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1092, + "module": "browsers._browser_plugin", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser type name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1093, + "module": "browsers._browser_plugin", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the browser type name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1094, + "module": "browsers._browser_plugin", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. 
These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1095, + "module": "browsers._browser_plugin", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the maximum number of pages that can be opened in a single browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1096, + "module": "browsers._browser_plugin", + "name": "max_open_pages_per_browser", + "parsedDocstring": { + "text": "Return the maximum number of pages that can be opened in a single browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1097, + "module": "browsers._browser_plugin", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 1098, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPlugin", + "type": "reference", + "target": "1090" + }, + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 1097, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1099, + "module": "browsers._browser_plugin", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 1100, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1101, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1102, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1103, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BrowserPlugin.__aexit__", + "target": 1099, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, 
+ "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1104, + "module": "browsers._browser_plugin", + "name": "new_browser", + "parsedDocstring": { + "text": "Create a new browser instance.\n", + "returns": "A new browser instance wrapped in a controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A new browser instance wrapped in a controller." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "flags": {}, + "id": 1105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser", + "parameters": [], + "type": { + "name": "BrowserController", + "type": "reference", + "target": "1073" + }, + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 1104, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for browser plugins.\n\nBrowser plugins act as wrappers around browser automation tools like Playwright,\nproviding a unified interface for interacting with browsers." 
+ } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1097, + 1099, + 1104 + ], + "title": "Methods" + }, + { + "children": [ + 1092, + 1091, + 1094, + 1095, + 1093, + 1096 + ], + "title": "Properties" + } + ], + "id": 1090, + "module": "browsers._browser_plugin", + "name": "BrowserPlugin", + "parsedDocstring": { + "text": "An abstract base class for browser plugins.\n\nBrowser plugins act as wrappers around browser automation tools like Playwright,\nproviding a unified interface for interacting with browsers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightBrowserPlugin", + "target": "1141", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1106, + "module": "browsers._playwright_browser", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1108, + "module": "browsers._playwright_browser", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1109, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1110, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1111, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1112, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1113, + "module": "browsers._playwright_browser", + "name": "browser_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "BrowserType", + "type": 
"reference", + "target": "1135" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1114, + "module": "browsers._playwright_browser", + "name": "contexts", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserContext" + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1115, + "module": "browsers._playwright_browser", + "name": "is_connected", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1116, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_connected", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create persistent context instead of regular one. Merge launch options with context options." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1117, + "module": "browsers._playwright_browser", + "name": "new_context", + "parsedDocstring": { + "text": "Create persistent context instead of regular one. 
Merge launch options with context options." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create persistent context instead of regular one. Merge launch options with context options." + } + ] + }, + "flags": {}, + "id": 1118, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1119, + "kind": 32768, + "kindString": "Parameter", + "name": "context_options", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "BrowserContext", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close browser by closing its context." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1120, + "module": "browsers._playwright_browser", + "name": "close", + "parsedDocstring": { + "text": "Close browser by closing its context." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close browser by closing its context." 
+ } + ] + }, + "flags": {}, + "id": 1121, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1122, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1123, + "module": "browsers._playwright_browser", + "name": "version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1124, + "module": "browsers._playwright_browser", + "name": "new_page", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1125, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1126, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": 
"reference" + } + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1127, + "module": "browsers._playwright_browser", + "name": "new_browser_cdp_session", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1128, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser_cdp_session", + "parameters": [], + "type": { + "name": "CDPSession", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1129, + "module": "browsers._playwright_browser", + "name": "start_tracing", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1130, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "start_tracing", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1131, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + 
] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1132, + "module": "browsers._playwright_browser", + "name": "stop_tracing", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1133, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "stop_tracing", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1134, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper for Playwright's `Browser` that operates with a persistent context.\n\nIt utilizes Playwright's persistent browser context feature, maintaining user data across sessions.\nWhile it follows the same interface as Playwright's `Browser` class, there is no abstract base class\nenforcing this. There is a limitation that only a single persistent context is allowed." 
+ } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1108, + 1120, + 1115, + 1127, + 1117, + 1124, + 1129, + 1132 + ], + "title": "Methods" + }, + { + "children": [ + 1113, + 1114, + 1123 + ], + "title": "Properties" + } + ], + "id": 1107, + "module": "browsers._playwright_browser", + "name": "PlaywrightPersistentBrowser", + "parsedDocstring": { + "text": "A wrapper for Playwright's `Browser` that operates with a persistent context.\n\nIt utilizes Playwright's persistent browser context feature, maintaining user data across sessions.\nWhile it follows the same interface as Playwright's `Browser` class, there is no abstract base class\nenforcing this. There is a limitation that only a single persistent context is allowed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1135, + "module": "browsers._types", + "name": "BrowserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1137, + "module": "browsers._types", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1138, + "module": "browsers._types", + "name": "browser_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1139, + "module": "browsers._types", + "name": "page", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a page object within a browser, with additional metadata for tracking and management." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1138, + 1137, + 1139 + ], + "title": "Properties" + } + ], + "id": 1136, + "module": "browsers._types", + "name": "CrawleePage", + "parsedDocstring": { + "text": "Represents a page object within a browser, with additional metadata for tracking and management." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1140, + "module": "browsers._playwright_browser_plugin", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the plugin is managing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1142, + "module": "browsers._browser_plugin", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the plugin is managing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.AUTOMATION_LIBRARY", + "target": 1091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1143, + "module": "browsers._playwright_browser_plugin", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "browser_type": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system.", + "user_data_dir": "Path to a User Data Directory, which stores browser session data like cookies and local\nstorage.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "max_open_pages_per_browser": "The maximum number of pages that can be opened in a single browser instance.\nOnce reached, a new browser instance will be launched to handle the excess.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1144, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system." + } + ] + }, + "defaultValue": "'chromium'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1145, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a User Data Directory, which stores browser session data like cookies and local\nstorage." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1146, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1147, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1148, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "dict[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of pages that can be opened in a single browser instance.\nOnce reached, a new browser instance will be launched to handle the excess." + } + ] + }, + "defaultValue": "20", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1149, + "kind": 32768, + "kindString": "Parameter", + "name": "max_open_pages_per_browser", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1150, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1151, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1979" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1152, + "module": "browsers._browser_plugin", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserPlugin.active", + "target": 1092, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser type name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1153, + "module": "browsers._browser_plugin", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the browser type name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + }, + "overwrites": { + "name": "BrowserPlugin.browser_type", + "target": 1093, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1154, + "module": "browsers._playwright_browser_plugin", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Return the options for the `browser.launch` method.\n\nKeyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n`browser_type.launch` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.browser_launch_options", + "target": 1094, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1155, + "module": "browsers._playwright_browser_plugin", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Return the options for the `browser.new_context` method.\n\nKeyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + "overwrites": { + "name": "BrowserPlugin.browser_new_context_options", + "target": 1095, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the maximum number of pages that can be opened in a single browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1156, + "module": "browsers._browser_plugin", + "name": "max_open_pages_per_browser", + "parsedDocstring": { + "text": "Return the maximum number of pages that can be opened in a single browser." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserPlugin.max_open_pages_per_browser", + "target": 1096, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1157, + "module": "browsers._browser_plugin", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 1098, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPlugin", + "type": "reference", + "target": "1090" + }, + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 1097, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.__aenter__", + "target": 1097, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": 
[], + "id": 1159, + "module": "browsers._browser_plugin", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close the browser plugin.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 162 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close the browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 1100, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1101, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1102, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1103, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": 
"BrowserPlugin.__aexit__", + "target": 1099, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.__aexit__", + "target": 1099, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1164, + "module": "browsers._browser_plugin", + "name": "new_browser", + "parsedDocstring": { + "text": "Create a new browser instance.\n", + "returns": "A new browser instance wrapped in a controller." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A new browser instance wrapped in a controller." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new browser instance.\n" + } + ] + }, + "flags": {}, + "id": 1105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_browser", + "parameters": [], + "type": { + "name": "BrowserController", + "type": "reference", + "target": "1073" + }, + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 1104, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserPlugin.new_browser", + "target": 1104, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A plugin for managing Playwright automation library.\n\nIt is a plugin designed to manage browser instances using the Playwright automation library. 
It acts as a factory\nfor creating new browser instances and provides a unified interface for interacting with different browser types\n(chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless\nmode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each\nbrowser instance, ensuring that resource limits are respected." + } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1157, + 1159, + 1143, + 1164 + ], + "title": "Methods" + }, + { + "children": [ + 1152, + 1142, + 1154, + 1155, + 1153, + 1156 + ], + "title": "Properties" + } + ], + "id": 1141, + "module": "browsers._playwright_browser_plugin", + "name": "PlaywrightBrowserPlugin", + "parsedDocstring": { + "text": "A plugin for managing Playwright automation library.\n\nIt is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory\nfor creating new browser instances and provides a unified interface for interacting with different browser types\n(chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless\nmode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each\nbrowser instance, ensuring that resource limits are respected." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_plugin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BrowserPlugin", + "target": "1090", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1166, + "module": "browsers._browser_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1168, + "module": "browsers._browser_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "plugins": "Browser plugins serve as wrappers around various browser automation libraries,\nproviding a consistent interface across different libraries.", + "operation_timeout": "Operations of the underlying automation libraries, such as launching a browser\nor opening a new page, can sometimes get stuck. 
To prevent `BrowserPool` from becoming unresponsive,\nwe add a timeout to these operations.", + "browser_inactive_threshold": "The period of inactivity after which a browser is considered as inactive.", + "identify_inactive_browsers_interval": "The period of inactivity after which a browser is considered\nas retired.", + "close_inactive_browsers_interval": "The interval at which the pool checks for inactive browsers\nand closes them. The browser is considered as inactive if it has no active pages and has been idle\nfor the specified period. The browser is considered as retired if it has no active pages and has total\npages count greater than or equal to `retire_browser_after_page_count`.", + "retire_browser_after_page_count": "The maximum number of processed pages after which the browser is considered\nas retired." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1169, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browser plugins serve as wrappers around various browser automation libraries,\nproviding a consistent interface across different libraries." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1170, + "kind": 32768, + "kindString": "Parameter", + "name": "plugins", + "type": { + "name": "Sequence[BrowserPlugin] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "1090" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Operations of the underlying automation libraries, such as launching a browser\nor opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,\nwe add a timeout to these operations." + } + ] + }, + "defaultValue": "timedelta(seconds=15)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1171, + "kind": 32768, + "kindString": "Parameter", + "name": "operation_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The period of inactivity after which a browser is considered as inactive." + } + ] + }, + "defaultValue": "timedelta(seconds=10)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1172, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_inactive_threshold", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The period of inactivity after which a browser is considered\nas retired." 
+ } + ] + }, + "defaultValue": "timedelta(seconds=20)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1173, + "kind": 32768, + "kindString": "Parameter", + "name": "identify_inactive_browsers_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The interval at which the pool checks for inactive browsers\nand closes them. The browser is considered as inactive if it has no active pages and has been idle\nfor the specified period. The browser is considered as retired if it has no active pages and has total\npages count greater than or equal to `retire_browser_after_page_count`." + } + ] + }, + "defaultValue": "timedelta(seconds=30)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1174, + "kind": 32768, + "kindString": "Parameter", + "name": "close_inactive_browsers_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of processed pages after which the browser is considered\nas retired." 
+ } + ] + }, + "defaultValue": "100", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1175, + "kind": 32768, + "kindString": "Parameter", + "name": "retire_browser_after_page_count", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1176, + "module": "browsers._browser_pool", + "name": "with_default_plugin", + "parsedDocstring": { + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n", + "args": { + "browser_type": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system.", + "user_data_dir": "Path to a user data directory, which stores browser session data like cookies\nand local storage.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "headless": "Whether to run the browser in headless mode.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "kwargs": "Additional arguments for default constructor." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n" + } + ] + }, + "flags": {}, + "id": 1177, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_plugin", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1178, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserType", + "target": "1135" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a user data directory, which stores browser session data like cookies\nand local storage." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1179, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1180, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1181, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1182, + "kind": 32768, + "kindString": "Parameter", + "name": "headless", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1183, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1979" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1184, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional arguments for default constructor." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1185, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "BrowserPool", + "type": "reference", + "target": "1167" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the browser plugins." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1186, + "module": "browsers._browser_pool", + "name": "plugins", + "parsedDocstring": { + "text": "Return the browser plugins." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "1090" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the active browsers in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1187, + "module": "browsers._browser_pool", + "name": "active_browsers", + "parsedDocstring": { + "text": "Return the active browsers in the pool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserController", + "target": "1073" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the inactive browsers in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1188, + "module": "browsers._browser_pool", + "name": "inactive_browsers", + "parsedDocstring": { + "text": "Return the inactive browsers in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserController", + "target": "1073" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the pages in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1189, + "module": "browsers._browser_pool", + "name": "pages", + "parsedDocstring": { + "text": "Return the pages in the pool." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "Mapping", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "CrawleePage", + "target": "1136" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser pool was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1190, + "module": "browsers._browser_pool", + "name": "total_pages_count", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser pool was launched." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 190 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1191, + "module": "browsers._browser_pool", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 195 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize all browser plugins.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1192, + "module": "browsers._browser_pool", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager and initialize all browser plugins.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 199 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager and initialize all browser plugins.\n" + } + ] + }, + "flags": {}, + "id": 1193, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "BrowserPool", + "type": "reference", + "target": "1167" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close all browser plugins.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1194, + "module": "browsers._browser_pool", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager and close all browser plugins.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 223 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + 
"signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager and close all browser plugins.\n" + } + ] + }, + "flags": {}, + "id": 1195, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1196, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1197, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1198, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a new page in a browser using the specified or a random browser plugin.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1199, + "module": "browsers._browser_pool", + "name": "new_page", + "parsedDocstring": { + "text": "Open a new page in a browser using the 
specified or a random browser plugin.\n", + "args": { + "page_id": "The ID to assign to the new page. If not provided, a random ID is generated.", + "browser_plugin": "browser_plugin: The browser plugin to use for creating the new page.\nIf not provided, the next plugin in the rotation is used.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "The newly created browser page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The newly created browser page." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open a new page in a browser using the specified or a random browser plugin.\n" + } + ] + }, + "flags": {}, + "id": 1200, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID to assign to the new page. If not provided, a random ID is generated." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1201, + "kind": 32768, + "kindString": "Parameter", + "name": "page_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "browser_plugin: The browser plugin to use for creating the new page.\nIf not provided, the next plugin in the rotation is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1202, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_plugin", + "type": { + "name": "BrowserPlugin | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserPlugin", + "target": "1090" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1203, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "CrawleePage", + "type": "reference", + "target": "1136" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1204, + "module": "browsers._browser_pool", + "name": "new_page_with_each_plugin", + "parsedDocstring": { + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. 
Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n", + "returns": "A list of newly created pages, one for each plugin in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 281 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A list of newly created pages, one for each plugin in the pool." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with each browser plugin in the pool.\n\nThis method is useful for running scripts in multiple environments simultaneously, typically for testing\nor website analysis. Each page is created using a different browser plugin, allowing you to interact\nwith various browser types concurrently.\n" + } + ] + }, + "flags": {}, + "id": 1205, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page_with_each_plugin", + "parameters": [], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CrawleePage", + "target": "1136" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called just before a new page is created.\n\nThe hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.\nNote that depending on the `BrowserController` implementation, `browser_new_context_options` may not\napply to every page individually. For example, `PlaywrightBrowserController` with\n``use_incognito_pages=False`` shares a single context across all pages, so the options are applied\nonly when the context is first created." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1206, + "module": "browsers._browser_pool", + "name": "pre_page_create_hook", + "parsedDocstring": { + "text": "Register a hook to be called just before a new page is created.\n\nThe hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.\nNote that depending on the `BrowserController` implementation, `browser_new_context_options` may not\napply to every page individually. For example, `PlaywrightBrowserController` with\n``use_incognito_pages=False`` shares a single context across all pages, so the options are applied\nonly when the context is first created." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 398 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called just before a new page is created.\n\nThe hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.\nNote that depending on the `BrowserController` implementation, `browser_new_context_options` may not\napply to every page individually. For example, `PlaywrightBrowserController` with\n``use_incognito_pages=False`` shares a single context across all pages, so the options are applied\nonly when the context is first created." 
+ } + ] + }, + "flags": {}, + "id": 1207, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_page_create_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1208, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[str, BrowserController, dict[str, Any], ProxyInfo | None]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[str, BrowserController, dict[str, Any], ProxyInfo | None]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called right after a new page is created.\n\nThe hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply\nchanges to all pages, such as injecting scripts or configuring request interception." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1209, + "module": "browsers._browser_pool", + "name": "post_page_create_hook", + "parsedDocstring": { + "text": "Register a hook to be called right after a new page is created.\n\nThe hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply\nchanges to all pages, such as injecting scripts or configuring request interception." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 413 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called right after a new page is created.\n\nThe hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply\nchanges to all pages, such as injecting scripts or configuring request interception." + } + ] + }, + "flags": {}, + "id": 1210, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_page_create_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1211, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[CrawleePage, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[CrawleePage, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called just before a page is closed.\n\nThe hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,\nsuch as taking a screenshot or saving page state before the page is destroyed." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1212, + "module": "browsers._browser_pool", + "name": "pre_page_close_hook", + "parsedDocstring": { + "text": "Register a hook to be called just before a page is closed.\n\nThe hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,\nsuch as taking a screenshot or saving page state before the page is destroyed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 425 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called just before a page is closed.\n\nThe hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,\nsuch as taking a screenshot or saving page state before the page is destroyed." + } + ] + }, + "flags": {}, + "id": 1213, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_page_close_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1214, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[CrawleePage, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[CrawleePage, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ 
+ { + "kind": "text", + "text": "Register a hook to be called right after a page is closed.\n\nThe hook receives the page ID and the `BrowserController`. Use it for cleanup or logging\nafter a page's lifecycle ends." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1215, + "module": "browsers._browser_pool", + "name": "post_page_close_hook", + "parsedDocstring": { + "text": "Register a hook to be called right after a page is closed.\n\nThe hook receives the page ID and the `BrowserController`. Use it for cleanup or logging\nafter a page's lifecycle ends." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 437 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called right after a page is closed.\n\nThe hook receives the page ID and the `BrowserController`. Use it for cleanup or logging\nafter a page's lifecycle ends." 
+ } + ] + }, + "flags": {}, + "id": 1216, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_page_close_hook", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1217, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[str, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[str, BrowserController]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manage a pool of browsers and pages, handling their lifecycle and resource allocation.\n\nThe `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,\nand handling the overall lifecycle of these resources. It provides flexible configuration via\nconstructor options, which include various hooks that allow for the insertion of custom behavior\nat different stages of the browser and page lifecycles.\n\nThe browsers in the pool can be in one of three states: active, inactive, or closed." 
+ } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1192, + 1194, + 1168, + 1199, + 1204, + 1215, + 1209, + 1212, + 1206, + 1176 + ], + "title": "Methods" + }, + { + "children": [ + 1191, + 1187, + 1188, + 1189, + 1186, + 1190 + ], + "title": "Properties" + } + ], + "id": 1167, + "module": "browsers._browser_pool", + "name": "BrowserPool", + "parsedDocstring": { + "text": "Manage a pool of browsers and pages, handling their lifecycle and resource allocation.\n\nThe `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,\nand handling the overall lifecycle of these resources. It provides flexible configuration via\nconstructor options, which include various hooks that allow for the insertion of custom behavior\nat different stages of the browser and page lifecycles.\n\nThe browsers in the pool can be in one of three states: active, inactive, or closed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_browser_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1218, + "module": "browsers._playwright_browser_controller", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the automation library that the controller is using." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1220, + "module": "browsers._browser_controller", + "name": "AUTOMATION_LIBRARY", + "parsedDocstring": { + "text": "The name of the automation library that the controller is using." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "BrowserController.AUTOMATION_LIBRARY", + "target": 1074, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1221, + "module": "browsers._playwright_browser_controller", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "browser": "The browser instance to control.", + "max_open_pages_per_browser": "The maximum number of pages that can be open at the same time.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.", + "header_generator": "An optional `HeaderGenerator` instance used to generate and manage HTTP headers for\nrequests made by the browser. By default, a predefined header generator is used. Set to `None` to\ndisable automatic header modifications.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1222, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The browser instance to control." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1223, + "kind": 32768, + "kindString": "Parameter", + "name": "browser", + "type": { + "name": "Browser | PlaywrightPersistentBrowser", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Browser" + }, + { + "type": "reference", + "name": "PlaywrightPersistentBrowser", + "target": "1107" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of pages that can be open at the same time." + } + ] + }, + "defaultValue": "20", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1224, + "kind": 32768, + "kindString": "Parameter", + "name": "max_open_pages_per_browser", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1225, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional `HeaderGenerator` instance used to generate and manage HTTP headers for\nrequests made by the browser. By default, a predefined header generator is used. Set to `None` to\ndisable automatic header modifications." + } + ] + }, + "defaultValue": "_DEFAULT_HEADER_GENERATOR", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1226, + "kind": 32768, + "kindString": "Parameter", + "name": "header_generator", + "type": { + "name": "HeaderGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGenerator", + "target": "1985" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1227, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1979" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the list of opened pages." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1228, + "module": "browsers._browser_controller", + "name": "pages", + "parsedDocstring": { + "text": "Return the list of opened pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Page" + } + ] + }, + "overwrites": { + "name": "BrowserController.pages", + "target": 1075, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of pages opened since the browser was launched." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1229, + "module": "browsers._browser_controller", + "name": "total_opened_pages", + "parsedDocstring": { + "text": "Return the total number of pages opened since the browser was launched." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.total_opened_pages", + "target": 1076, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of currently open pages." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1230, + "module": "browsers._browser_controller", + "name": "pages_count", + "parsedDocstring": { + "text": "Return the number of currently open pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.pages_count", + "target": 1077, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the time when the last page was opened." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1231, + "module": "browsers._browser_controller", + "name": "last_page_opened_at", + "parsedDocstring": { + "text": "Return the time when the last page was opened." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "datetime", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.last_page_opened_at", + "target": 1078, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the idle time of the browser controller." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1232, + "module": "browsers._browser_controller", + "name": "idle_time", + "parsedDocstring": { + "text": "Return the idle time of the browser controller." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.idle_time", + "target": 1079, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser has free capacity to open a new page." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1233, + "module": "browsers._browser_controller", + "name": "has_free_capacity", + "parsedDocstring": { + "text": "Return if the browser has free capacity to open a new page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.has_free_capacity", + "target": 1080, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return if the browser is closed." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1234, + "module": "browsers._browser_controller", + "name": "is_browser_connected", + "parsedDocstring": { + "text": "Return if the browser is closed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.is_browser_connected", + "target": 1081, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the type of the browser." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1235, + "module": "browsers._browser_controller", + "name": "browser_type", + "parsedDocstring": { + "text": "Return the type of the browser." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "BrowserType", + "type": "reference", + "target": "1135" + }, + "overwrites": { + "name": "BrowserController.browser_type", + "target": 1082, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1236, + "module": "browsers._playwright_browser_controller", + "name": "new_page", + "parsedDocstring": { + "text": "Create a new page with the given context options.\n", + "args": { + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.", + "proxy_info": "The proxy configuration to use for the new page.\n" + }, + "returns": "Page: The newly created page.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 138 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Page: The newly created page.\n" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new page with the given context options.\n" + } + ] + }, + "flags": {}, + "id": 1237, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "new_page", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\nPlaywright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1238, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The proxy configuration to use for the new page.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1239, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Page", + "type": "reference" + }, + "overwrites": { + "name": "BrowserController.new_page", + "target": 1083, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserController.new_page", + "target": 1083, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1240, + "module": "browsers._playwright_browser_controller", + "name": "close", + "parsedDocstring": { + "text": "Close the browser.\n", + "args": { + "force": "Whether to force close all open pages before closing the browser.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + 
"type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the browser.\n" + } + ] + }, + "flags": {}, + "id": 1241, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to force close all open pages before closing the browser.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1242, + "kind": 32768, + "kindString": "Parameter", + "name": "force", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BrowserController.close", + "target": 1087, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BrowserController.close", + "target": 1087, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controller for managing Playwright browser instances and their pages.\n\nIt provides methods to control browser instances, manage their pages, and handle context-specific\nconfigurations. It enforces limits on the number of open pages and tracks their state." + } + ] + }, + "decorations": [ + { + "args": "('Browser management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1221, + 1240, + 1236 + ], + "title": "Methods" + }, + { + "children": [ + 1220, + 1235, + 1233, + 1232, + 1234, + 1231, + 1228, + 1230, + 1229 + ], + "title": "Properties" + } + ], + "id": 1219, + "module": "browsers._playwright_browser_controller", + "name": "PlaywrightBrowserController", + "parsedDocstring": { + "text": "Controller for managing Playwright browser instances and their pages.\n\nIt provides methods to control browser instances, manage their pages, and handle context-specific\nconfigurations. 
It enforces limits on the number of open pages and tracks their state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/browsers/_playwright_browser_controller.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BrowserController", + "target": "1073", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1243, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "TParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1244, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "TSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1246, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1250, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4243, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4244, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4245, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4246, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4247, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4248, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4249, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4250, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4251, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4252, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4253, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4254, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `AbstractHttpCrawler`." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4253, + 4254, + 1246, + 1250 + ], + "title": "Methods" + }, + { + "children": [ + 4248, + 4251, + 4243, + 4252, + 4246, + 4249, + 4244, + 4247, + 4245, + 4250 + ], + "title": "Properties" + } + ], + "id": 1245, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "HttpCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `AbstractHttpCrawler`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpCrawlingResult", + "target": "2068", + "type": "reference" + }, + { + "name": "BasicCrawlingContext", + "target": "504", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "1252", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightPostNavCrawlingContext", + "target": "1477", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1253, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1254, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1255, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1256, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1257, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1258, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "1245" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1259, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1260, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1261, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4309, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4310, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4311, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4312, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4313, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4314, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4315, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4316, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4317, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4318, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4319, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4320, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4321, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4322, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by `AbstractHttpCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4321, + 4322, + 4309, + 1256, + 4310 + ], + "title": "Methods" + }, + { + "children": [ + 4316, + 1254, + 1255, + 4319, + 4311, + 4320, + 1253, + 4314, + 4317, + 4312, + 4315, + 4313, + 4318 + ], + "title": "Properties" + } + ], + "id": 1252, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "ParsedHttpCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by `AbstractHttpCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpCrawlingContext", + "target": "1245", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "AdaptivePlaywrightCrawlingContext", + "target": "1442", + "type": "reference" + }, + { + "name": "BeautifulSoupCrawlingContext", + "target": "1651", + "type": "reference" + }, + { + "name": "ParselCrawlingContext", + "target": "1716", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1262, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } 
+ }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1263, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for the HTTP request." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1265, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "navigation_timeout", + "parsedDocstring": { + "text": "Timeout for the HTTP request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4189, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4190, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4191, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4192, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4193, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4194, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4195, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4196, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4197, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4198, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4199, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4200, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4201, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4202, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4203, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4204, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4205, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4206, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4207, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4208, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4209, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4210, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4211, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4212, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4213, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4214, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4215, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4206, + 4210, + 4204, + 4191, + 4207, + 4192, + 4197, + 4215, + 4211, + 4209, + 4201, + 4198, + 4199, + 4200, + 1265, + 4196, + 4189, + 4205, + 4194, + 4212, + 4203, + 4195, + 4190, + 4208, + 4214, + 4213, + 4193, + 4202 + ], + "title": "Properties" + } + ], + "id": 1264, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "HttpCrawlerOptions", + "parsedDocstring": { + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlerOptions", + "target": "1554", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1267, + "module": "crawlers._basic._basic_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "configuration": "The `Configuration` instance. Some of its properties are used as defaults for the crawler.", + "event_manager": "The event manager for managing events for the crawler and all its components.", + "storage_client": "The storage client for managing storages for the crawler and all its components.", + "request_manager": "Manager of requests that should be processed by the crawler.", + "session_pool": "A custom `SessionPool` instance, allowing the use of non-default configuration.", + "proxy_configuration": "HTTP proxy configuration used when making requests.", + "http_client": "HTTP client used by `BasicCrawlingContext.send_request` method.", + "request_handler": "A callable responsible for handling requests.", + "max_request_retries": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`).", + "max_requests_per_crawl": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved.", + "max_session_rotations": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\nThe session rotations are not counted towards the `max_request_retries` limit.", + "max_crawl_depth": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions.", + "use_session_pool": "Enable the use of a session pool for managing sessions during crawling.", + "retry_on_blocked": "If True, the crawler attempts to bypass bot protections automatically.", + "additional_http_error_status_codes": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered.", + "ignore_http_error_status_codes": "HTTP status codes that are typically considered errors but should be treated\nas successful responses.", + "concurrency_settings": "Settings to fine-tune concurrency levels.", + "request_handler_timeout": "Maximum duration allowed for a single request handler to run.", + "statistics": "A custom `Statistics` instance, allowing the use of non-default configuration.", + "abort_on_error": "If True, the crawler stops immediately when any request handler error occurs.", + "keep_alive": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler.", + 
"configure_logging": "If True, the crawler will set up logging infrastructure automatically.", + "statistics_log_format": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages.", + "respect_robots_txt_file": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`", + "status_message_logging_interval": "Interval for logging the crawler status messages.", + "status_message_callback": "Allows overriding the default status message. The default status message is\nprovided in the parameters. Returning `None` suppresses the status message.", + "id": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state\nbetween them.", + "_context_pipeline": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_additional_context_managers": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_logger": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1557, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1558, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1559, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1560, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1561, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1562, + "kind": 32768, + "kindString": "Parameter", + "name": "session_pool", + "type": { + "name": "SessionPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1563, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_configuration", + "type": { + "name": "ProxyConfiguration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1564, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1565, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler", + "type": { + "name": "Callable[[TCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1566, + "kind": 32768, + "kindString": "Parameter", + "name": "max_request_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1567, + "kind": 32768, + "kindString": "Parameter", + "name": "max_requests_per_crawl", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "defaultValue": "10", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1568, + "kind": 32768, + "kindString": "Parameter", + "name": "max_session_rotations", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1569, + "kind": 32768, + "kindString": "Parameter", + "name": "max_crawl_depth", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1570, + "kind": 32768, + "kindString": "Parameter", + "name": "use_session_pool", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1571, + "kind": 32768, + "kindString": "Parameter", + "name": "retry_on_blocked", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1572, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated\nas successful responses." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1573, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1574, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1575, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1576, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[TStatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1577, + "kind": 32768, + "kindString": "Parameter", + "name": "abort_on_error", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1578, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_alive", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1579, + "kind": 32768, + "kindString": "Parameter", + "name": "configure_logging", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages." 
+ } + ] + }, + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1580, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1581, + "kind": 32768, + "kindString": "Parameter", + "name": "respect_robots_txt_file", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "defaultValue": "timedelta(seconds=10)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1582, + "kind": 32768, + "kindString": "Parameter", + "name": "status_message_logging_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is\nprovided in the parameters. Returning `None` suppresses the status message." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1583, + "kind": 32768, + "kindString": "Parameter", + "name": "status_message_callback", + "type": { + "name": "Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state\nbetween them." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1584, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1585, + "kind": 32768, + "kindString": "Parameter", + "name": "_context_pipeline", + "type": { + "name": "ContextPipeline[TCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "1495" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1586, + "kind": 32768, + "kindString": "Parameter", + "name": "_additional_context_managers", + "type": { + "name": "Sequence[AbstractAsyncContextManager] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractAsyncContextManager" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1587, + "kind": 32768, + "kindString": "Parameter", + "name": "_logger", + "type": { + "name": "logging.Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "logging.Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1272, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "flags": {}, + "id": 1273, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1274, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TParseResult", + "target": "1243" + }, + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ], + "target": "1281" + } + } + ], + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractHttpCrawler", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "TParseResult", + "target": "1243" + } + ], + "target": "1252" + }, + { + "type": "reference", + "name": "TParseResult", + "target": "1243" + }, + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ], + "target": "1266" + } + ], + "target": "981" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook 
to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1275, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1276, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1277, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[BasicCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1278, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called after each navigation.\n", + "args": { + "hook": "A coroutine function to be called after each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called after each navigation." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1280, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[HttpCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4090, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4091, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4092, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4093, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." + } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4094, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4095, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4096, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4097, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4098, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4099, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4100, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4101, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4102, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4103, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4104, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests.\n\nThe `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,\nit implements HTTP communication using HTTP clients. The class allows integration with any HTTP client\nthat implements the `HttpClient` interface, provided as an input parameter to the constructor.\n\n`AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses\nand the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include\n`BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.\n\nHTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that\nrequire client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`." 
+ } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1267, + 4101, + 1272, + 4097, + 4104, + 4098, + 4103, + 4095, + 4096, + 4094, + 4099, + 1278, + 1275, + 4100, + 4093, + 4102 + ], + "title": "Methods" + }, + { + "children": [ + 4090, + 4091, + 4092 + ], + "title": "Properties" + } + ], + "id": 1266, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "AbstractHttpCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests.\n\nThe `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,\nit implements HTTP communication using HTTP clients. The class allows integration with any HTTP client\nthat implements the `HttpClient` interface, provided as an input parameter to the constructor.\n\n`AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses\nand the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include\n`BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.\n\nHTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that\nrequire client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawler", + "target": "1555", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "BeautifulSoupCrawler", + "target": "1646", + "type": "reference" + }, + { + "name": "HttpCrawler", + "target": "1685", + "type": "reference" + }, + { + "name": "ParselCrawler", + "target": "1712", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1282, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse HTTP response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "flags": {}, + "id": 1283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1284, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1285, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 1286, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1287, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1288, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 1289, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1290, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1291, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1292, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects 
`is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 1293, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1294, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "1848" + }, + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1295, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 1296, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1297, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1298, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1299, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.", + "attribute": "Which node attribute to extract the links from.\n" + }, + "returns": "Iterable of strings that contain found links." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 1300, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1301, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1302, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1303, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking." 
+ } + ] + }, + "decorations": [ + { + "args": "('HTTP parsers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1299, + 1292, + 1295, + 1282, + 1285, + 1288 + ], + "title": "Methods" + } + ], + "id": 1281, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "AbstractHttpParser", + "parsedDocstring": { + "text": "Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BeautifulSoupParser", + "target": "1661", + "type": "reference" + }, + { + "name": "NoParser", + "target": "1689", + "type": "reference" + }, + { + "name": "ParselParser", + "target": "1726", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1305, + "module": "statistics._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StatisticsState.model_config", + "target": 2667, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times static http based crawling was used." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1306, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "http_only_request_handler_runs", + "parsedDocstring": { + "text": "Number representing how many times static http based crawling was used." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times browser based crawling was used." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1307, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "browser_request_handler_runs", + "parsedDocstring": { + "text": "Number representing how many times browser based crawling was used." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number representing how many times the predictor gave incorrect prediction." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1308, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "rendering_type_mispredictions", + "parsedDocstring": { + "text": "Number representing how many times the predictor gave incorrect prediction." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4068, + "module": "statistics._models", + "name": "stats_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='statsId')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.stats_id", + "target": 2668, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4069, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_finished", + "target": 2669, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4070, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_failed", + "target": 2670, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4071, + "module": "statistics._models", + "name": "requests_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_retries", + "target": 2671, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4072, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "float", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_failed_per_minute", + "target": 2672, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4073, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "float", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_finished_per_minute", + "target": 2673, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4074, + "module": "statistics._models", + "name": "request_min_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "790" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_min_duration", + "target": 2674, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4075, + "module": "statistics._models", + "name": "request_max_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "790" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_max_duration", + "target": 2675, + "type": "reference" + } 
+ }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4076, + "module": "statistics._models", + "name": "request_total_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_failed_duration", + "target": 2676, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4077, + "module": "statistics._models", + "name": "request_total_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_finished_duration", + "target": 2677, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4078, + "module": "statistics._models", + "name": "crawler_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerStartedAt')]", + "type": "union", + "types": [ + { + "type": 
"reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_started_at", + "target": 2678, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4079, + "module": "statistics._models", + "name": "crawler_last_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_last_started_at", + "target": 2679, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4080, + "module": "statistics._models", + "name": "crawler_finished_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerFinishedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_finished_at", + "target": 2680, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + 
"flags": {}, + "groups": [], + "id": 4081, + "module": "statistics._models", + "name": "stats_persisted_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.stats_persisted_at", + "target": 2681, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4082, + "module": "statistics._models", + "name": "request_retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_retry_histogram", + "target": 2682, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4083, + "module": "statistics._models", + "name": "model_post_init", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + 
"name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "model_post_init", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2685, + "kind": 32768, + "kindString": "Parameter", + "name": "__context", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "StatisticsState.model_post_init", + "target": 2683, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StatisticsState.model_post_init", + "target": 2683, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "crawler_runtime" + } + ], + "flags": {}, + "groups": [], + "id": 4084, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime", + "target": 2686, + "type": "reference" + }, + "overwrites": { + "name": "StatisticsState.crawler_runtime", + "target": 2687, + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2688, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawler_runtime", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2689, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", 
+ "value": null + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime", + "target": 2686, + "type": "reference" + }, + "overwrites": { + "name": "StatisticsState.crawler_runtime", + "target": 2687, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='crawlerRuntimeMillis')", + "name": "computed_field" + } + ], + "flags": {}, + "groups": [], + "id": 4085, + "module": "statistics._models", + "name": "crawler_runtime_for_serialization", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2691, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawler_runtime_for_serialization", + "parameters": [], + "type": { + "name": "timedelta", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime_for_serialization", + "target": 2690, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime_for_serialization", + "target": 2690, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestTotalDurationMillis', return_type=timedelta_ms)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4086, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.request_total_duration", + "target": 2692, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4087, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_avg_failed_duration", + "target": 2693, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4088, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, 
+ { + "type": "literal", + "value": null + } + ] + }, + "inheritedFrom": { + "name": "StatisticsState.request_avg_finished_duration", + "target": 2694, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestsTotal')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4089, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.requests_total", + "target": 2695, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistic data about a crawler run with additional information related to adaptive crawling." + } + ] + }, + "decorations": [ + { + "args": "('Statistics')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4085, + 4083 + ], + "title": "Methods" + }, + { + "children": [ + 1307, + 4080, + 4079, + 4084, + 4078, + 1306, + 1305, + 1308, + 4087, + 4088, + 4075, + 4074, + 4082, + 4086, + 4076, + 4077, + 4070, + 4072, + 4069, + 4073, + 4071, + 4089, + 4068, + 4081 + ], + "title": "Properties" + } + ], + "id": 1304, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics", + "name": "AdaptivePlaywrightCrawlerStatisticState", + "parsedDocstring": { + "text": "Statistic data about a crawler run with additional information related to adaptive crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StatisticsState", + "target": "2666", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1309, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1310, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "UrlComponents", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1311, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1312, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "FeatureVector", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1314, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1315, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "model", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + 
"line": 37 + } + ], + "type": { + "name": "LogisticRegression", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1316, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "labels_coefficients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "defaultdict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "float" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1316, + 1315, + 1314 + ], + "title": "Properties" + } + ], + "id": 1313, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingTypePredictorState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Recommended rendering type." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1318, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "rendering_type", + "parsedDocstring": { + "text": "Recommended rendering type." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "RenderingType", + "type": "reference", + "target": "1311" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Recommended rendering detection probability. Expected values between 0-1.\n\nZero represents absolute confidence in `rendering_type` recommendation.\nOne represents no confidence in `rendering_type` recommendation." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1319, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "detection_probability_recommendation", + "parsedDocstring": { + "text": "Recommended rendering detection probability. Expected values between 0-1.\n\nZero represents absolute confidence in `rendering_type` recommendation.\nOne represents no confidence in `rendering_type` recommendation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Rendering type recommendation with detection probability recommendation." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + }, + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1319, + 1318 + ], + "title": "Properties" + } + ], + "id": 1317, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingTypePrediction", + "parsedDocstring": { + "text": "Rendering type recommendation with detection probability recommendation." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1321, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." + } + ] + }, + "flags": {}, + "id": 1322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1323, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "predict", + "parsedDocstring": { + "text": "Get `RenderingTypePrediction` based on the input request.\n", + "args": { + "request": "`Request` instance for which the prediction is made." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "flags": {}, + "id": 1324, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`Request` instance for which the prediction is made." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1325, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "RenderingTypePrediction", + "type": "reference", + "target": "1317" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1326, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "store_result", + "parsedDocstring": { + "text": "Store prediction results and retrain the model.\n", + "args": { + "request": "Used request.", + "rendering_type": "Known suitable `RenderingType`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "flags": {}, + "id": 1327, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_result", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used request." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1328, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Known suitable `RenderingType`." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1329, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type", + "type": { + "name": "RenderingType", + "type": "reference", + "target": "1311" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize additional resources required for the predictor operation." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1330, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "initialize", + "parsedDocstring": { + "text": "Initialize additional resources required for the predictor operation." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize additional resources required for the predictor operation." + } + ] + }, + "flags": {}, + "id": 1331, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "initialize", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear and release additional resources used by the predictor." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1332, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "clear", + "parsedDocstring": { + "text": "Clear and release additional resources used by the predictor." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear and release additional resources used by the predictor." + } + ] + }, + "flags": {}, + "id": 1333, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "clear", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the predictor upon entering the context manager." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1334, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the predictor upon entering the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the predictor upon entering the context manager." + } + ] + }, + "flags": {}, + "id": 1335, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "RenderingTypePredictor", + "type": "reference", + "target": "1320" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor upon exiting the context manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1336, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__aexit__", + "parsedDocstring": { + "text": "Clear the predictor upon exiting the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor upon exiting the context manager." 
+ } + ] + }, + "flags": {}, + "id": 1337, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1338, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1339, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1340, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls." 
+ } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1334, + 1336, + 1321, + 1332, + 1330, + 1323, + 1326 + ], + "title": "Methods" + } + ], + "id": 1320, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "RenderingTypePredictor", + "parsedDocstring": { + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DefaultRenderingTypePredictor", + "target": "1341", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1342, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "detection_ratio": "A number between 0 and 1 that determines the desired ratio of rendering type detections.", + "persist_state_key": "Key in the key-value storage where the trained model parameters will be saved.\nIf None, defaults to 'rendering-type-predictor-state'.", + "persistence_enabled": "Whether to enable persistence of the trained model parameters for reuse." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1343, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A number between 0 and 1 that determines the desired ratio of rendering type detections." + } + ] + }, + "defaultValue": "0.1", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1344, + "kind": 32768, + "kindString": "Parameter", + "name": "detection_ratio", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable persistence of the trained model parameters for reuse." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1345, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key in the key-value storage where the trained model parameters will be saved.\nIf None, defaults to 'rendering-type-predictor-state'." 
+ } + ] + }, + "defaultValue": "'rendering-type-predictor-state'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1346, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RenderingTypePredictor.__init__", + "target": 1321, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.__init__", + "target": 1321, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get current state of the predictor." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1347, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "initialize", + "parsedDocstring": { + "text": "Get current state of the predictor." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get current state of the predictor." 
+ } + ] + }, + "flags": {}, + "id": 1348, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "initialize", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RenderingTypePredictor.initialize", + "target": 1330, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.initialize", + "target": 1330, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor state." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1349, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "clear", + "parsedDocstring": { + "text": "Clear the predictor state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor state." 
+ } + ] + }, + "flags": {}, + "id": 1350, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "clear", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RenderingTypePredictor.clear", + "target": 1332, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.clear", + "target": 1332, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1351, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "predict", + "parsedDocstring": { + "text": "Get `RenderingTypePrediction` based on the input request.\n", + "args": { + "request": "`Request` instance for which the prediction is made." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get `RenderingTypePrediction` based on the input request.\n" + } + ] + }, + "flags": {}, + "id": 1352, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "predict", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`Request` instance for which the prediction is made." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1353, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "RenderingTypePrediction", + "type": "reference", + "target": "1317" + }, + "overwrites": { + "name": "RenderingTypePredictor.predict", + "target": 1323, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.predict", + "target": 1323, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 1354, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "store_result", + "parsedDocstring": { + "text": "Store prediction results and retrain the model.\n", + "args": { + "request": "Used `Request` instance.", + "rendering_type": "Known suitable `RenderingType` for the used `Request` instance." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 209 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store prediction results and retrain the model.\n" + } + ] + }, + "flags": {}, + "id": 1355, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_result", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used `Request` instance." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1356, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Known suitable `RenderingType` for the used `Request` instance." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1357, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type", + "type": { + "name": "RenderingType", + "type": "reference", + "target": "1311" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RenderingTypePredictor.store_result", + "target": 1326, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RenderingTypePredictor.store_result", + "target": 1326, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the predictor upon entering the context manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4066, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the predictor upon entering the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the predictor upon entering the context manager." 
+ } + ] + }, + "flags": {}, + "id": 1335, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "RenderingTypePredictor", + "type": "reference", + "target": "1320" + }, + "inheritedFrom": { + "name": "RenderingTypePredictor.__aenter__", + "target": 1334, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RenderingTypePredictor.__aenter__", + "target": 1334, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor upon exiting the context manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4067, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "__aexit__", + "parsedDocstring": { + "text": "Clear the predictor upon exiting the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear the predictor upon exiting the context manager." 
+ } + ] + }, + "flags": {}, + "id": 1337, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1338, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1339, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1340, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "RenderingTypePredictor.__aexit__", + "target": 1336, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RenderingTypePredictor.__aexit__", + "target": 1336, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.\n\n`RenderingTypePredictor` implementation based on logistic regression:\nhttps://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4066, + 4067, + 1342, + 1349, + 1347, + 1351, + 1354 + ], + "title": "Methods" + } + ], + "id": 1341, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "DefaultRenderingTypePredictor", + "parsedDocstring": { + "text": "Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited 
urls.\n\n`RenderingTypePredictor` implementation based on logistic regression:\nhttps://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RenderingTypePredictor", + "target": "1320", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get list of url components where first component is host name." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1358, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "get_url_components", + "parsedDocstring": { + "text": "Get list of url components where first component is host name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 250 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get list of url components where first component is host name." 
+ } + ] + }, + "flags": {}, + "id": 1359, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_url_components", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1360, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "1310" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. Return their weighted average." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1361, + "module": "crawlers._adaptive_playwright._rendering_type_predictor", + "name": "calculate_url_similarity", + "parsedDocstring": { + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. Return their weighted average." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate url similarity based on host name and path components similarity.\n\nReturn 0 if different host names.\nCompare path components using jaro-wrinkler method and assign 1 or 0 value based on similarity_cutoff for each\npath component. 
Return their weighted average." + } + ] + }, + "flags": {}, + "id": 1362, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate_url_similarity", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1363, + "kind": 32768, + "kindString": "Parameter", + "name": "url_1", + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "1310" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1364, + "kind": 32768, + "kindString": "Parameter", + "name": "url_2", + "type": { + "name": "UrlComponents", + "type": "reference", + "target": "1310" + } + } + ], + "type": { + "name": "float", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a default comparator function for evaluating request handler results." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1365, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "create_default_comparator", + "parsedDocstring": { + "text": "Create a default comparator function for evaluating request handler results." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 11 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a default comparator function for evaluating request handler results." 
+ } + ] + }, + "flags": {}, + "id": 1366, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_default_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1367, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1368, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "full_result_comparator", + "parsedDocstring": { + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different 
requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing all their parts.\n\nComparison of `add_requests_calls` will consider same url requests with different parameters as different\nFor example following two request will be considered as different requests:\nhttps://sdk.apify.com/docs/guides/getting-started\nhttps://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712" + } + ] + }, + "flags": {}, + "id": 1369, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "full_result_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1370, + "kind": 32768, + "kindString": "Parameter", + "name": "result_1", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "394" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1371, + "kind": 32768, + "kindString": "Parameter", + "name": "result_2", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "394" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1372, + "module": "crawlers._adaptive_playwright._result_comparator", + "name": "push_data_only_comparator", + "parsedDocstring": { + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare results by comparing their push data calls. Ignore other parts of results in comparison." + } + ] + }, + "flags": {}, + "id": 1373, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "push_data_only_comparator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1374, + "kind": 32768, + "kindString": "Parameter", + "name": "result_1", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "394" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1375, + "kind": 32768, + "kindString": "Parameter", + "name": "result_2", + "type": { + "name": "RequestHandlerRunResult", + "type": "reference", + "target": "394" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1376, + "module": "crawlers._adaptive_playwright._utils", + "name": "sklearn_model_validator", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_utils.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1377, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "sklearn_model_validator", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1378, + "kind": 32768, + "kindString": "Parameter", + "name": "v", + "type": { + "name": "LogisticRegression | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "LogisticRegression" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "LogisticRegression", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1379, + "module": "crawlers._adaptive_playwright._utils", + "name": "sklearn_model_serializer", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1380, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "sklearn_model_serializer", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1381, + "kind": 32768, + "kindString": "Parameter", + "name": "model", + "type": { + "name": "LogisticRegression", + "type": "reference" + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + 
"type": "reference", + "name": "Any" + } + ] + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1382, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1383, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1384, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "TStaticCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + 
"kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1386, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2709, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2710, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool | Literal['explicit_only']", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "explicit_only" + } + ] + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2711, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2712, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": 
true + }, + "id": 2713, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_factory", + "type": { + "name": "Callable[[], Coroutine[None, None, KeyValueStore]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Coroutine", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "3700" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2714, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2715, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2716, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2717, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "981" + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2718, + "kind": 32768, 
+ "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2719, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__init__", + "target": 2708, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__init__", + "target": 2708, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1388, + "module": "statistics._statistics", + "name": "__aenter__", + "parsedDocstring": { + "text": "Subscribe to events and start collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 2736, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + }, + "overwrites": { + "name": "Statistics.__aenter__", + "target": 2735, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__aenter__", 
+ "target": 2735, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1390, + "module": "statistics._statistics", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 2738, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2739, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2740, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2741, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { 
+ "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__aexit__", + "target": 2737, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Statistics.__aexit__", + "target": 2737, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4056, + "module": "statistics._statistics", + "name": "replace_state_model", + "parsedDocstring": { + "text": "Create near copy of the `Statistics` with replaced `state_model`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." 
+ } + ] + }, + "flags": {}, + "id": 2721, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "replace_state_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2722, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type[TNewStatisticsState]", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics[TNewStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.replace_state_model", + "target": 2720, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.replace_state_model", + "target": 2720, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." + } + ] + }, + "decorations": [ + { + "name": "staticmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4057, + "module": "statistics._statistics", + "name": "with_default_state", + "parsedDocstring": { + "text": "Initialize a new instance with default state model `StatisticsState`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." 
+ } + ] + }, + "flags": {}, + "id": 2724, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2725, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2726, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2727, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2728, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_factory", + "type": { + "name": "Callable[[], Coroutine[None, None, KeyValueStore]] | None", + "type": "reference" + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2729, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2730, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "reference" + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2731, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + 
"defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2732, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal['table', 'inline']", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2733, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics[StatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.with_default_state", + "target": 2723, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.with_default_state", + "target": 2723, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4058, + "module": "statistics._statistics", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.active", + "target": 2734, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4059, + "module": "statistics._statistics", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "TStatisticsState", + "type": "reference", + "target": "1263" + }, + "inheritedFrom": { + "name": "Statistics.state", + "target": 2742, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4060, + "module": "statistics._statistics", + "name": "register_status_code", + "parsedDocstring": { + "text": "Increment the number of times a status code has been received." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 208 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." 
+ } + ] + }, + "flags": {}, + "id": 2744, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "register_status_code", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2745, + "kind": 32768, + "kindString": "Parameter", + "name": "code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.register_status_code", + "target": 2743, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.register_status_code", + "target": 2743, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4061, + "module": "statistics._statistics", + "name": "record_request_processing_start", + "parsedDocstring": { + "text": "Mark a request as started." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." 
+ } + ] + }, + "flags": {}, + "id": 2747, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_start", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2748, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_start", + "target": 2746, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_start", + "target": 2746, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4062, + "module": "statistics._statistics", + "name": "record_request_processing_finish", + "parsedDocstring": { + "text": "Mark a request as finished." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." 
+ } + ] + }, + "flags": {}, + "id": 2750, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_finish", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2751, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_finish", + "target": 2749, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_finish", + "target": 2749, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4063, + "module": "statistics._statistics", + "name": "record_request_processing_failure", + "parsedDocstring": { + "text": "Mark a request as failed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." 
+ } + ] + }, + "flags": {}, + "id": 2753, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_failure", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2754, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.record_request_processing_failure", + "target": 2752, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.record_request_processing_failure", + "target": 2752, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4064, + "module": "statistics._statistics", + "name": "calculate", + "parsedDocstring": { + "text": "Calculate the current statistics." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." 
+ } + ] + }, + "flags": {}, + "id": 2756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate", + "parameters": [], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "Statistics.calculate", + "target": 2755, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.calculate", + "target": 2755, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4065, + "module": "statistics._statistics", + "name": "reset", + "parsedDocstring": { + "text": "Reset the statistics to their defaults and remove any persistent state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 277 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "flags": {}, + "id": 2758, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "Statistics.reset", + "target": 2757, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "Statistics.reset", + "target": 2757, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics compliant object that is not supposed to do anything when entering/exiting context.\n\nTo be used in sub crawlers." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1388, + 1390, + 1386, + 4064, + 4063, + 4062, + 4061, + 4060, + 4056, + 4065, + 4057 + ], + "title": "Methods" + }, + { + "children": [ + 4058, + 4059 + ], + "title": "Properties" + } + ], + "id": 1385, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "_NonPersistentStatistics", + "parsedDocstring": { + "text": "Statistics compliant object that is not supposed to do anything when entering/exiting context.\n\nTo be used in sub crawlers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Statistics", + "target": "2707", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1396, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n", + "args": { + "rendering_type_predictor": "Object that implements RenderingTypePredictor and is capable of predicting which\nrendering method should be used. 
If None, then `DefaultRenderingTypePredictor` is used.", + "result_checker": "Function that evaluates whether crawling result is valid or not.", + "result_comparator": "Function that compares two crawling results and decides whether they are equivalent.", + "static_parser": "Implementation of `AbstractHttpParser`. Parser that will be used for static crawling.", + "static_crawler_specific_kwargs": "`AbstractHttpCrawler` only kwargs that are passed to the sub crawler.", + "playwright_crawler_specific_kwargs": "`PlaywrightCrawler` only kwargs that are passed to the sub crawler.", + "statistics": "A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of\nnon-default configuration.", + "kwargs": "Additional keyword arguments to pass to the underlying `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance. Recommended way to create instance is to call factory methods.\n\nRecommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n" + } + ] + }, + "flags": {}, + "id": 1397, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Implementation of `AbstractHttpParser`. Parser that will be used for static crawling." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1398, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ], + "target": "1281" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Object that implements RenderingTypePredictor and is capable of predicting which\nrendering method should be used. If None, then `DefaultRenderingTypePredictor` is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1399, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "1320" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function that evaluates whether crawling result is valid or not." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1400, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Function that compares two crawling results and decides whether they are equivalent." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1401, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "`PlaywrightCrawler` only kwargs that are passed to the sub crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1402, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "1835" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of\nnon-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1403, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[AdaptivePlaywrightCrawlerStatisticState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "AdaptivePlaywrightCrawlerStatisticState", + "target": "1304" + } + ], + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1526, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1527, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1528, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1529, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1530, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1531, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1532, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1533, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1534, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1535, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1536, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1537, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1538, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1539, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1540, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1541, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1542, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1543, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1544, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1545, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1546, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1547, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1548, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1549, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1550, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1405, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "with_beautifulsoup_static_parser", + "parsedDocstring": { + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 235 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content." 
+ } + ] + }, + "flags": {}, + "id": 1406, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_beautifulsoup_static_parser", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1407, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "1320" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1408, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1409, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1410, + "kind": 32768, + "kindString": "Parameter", + "name": "parser_type", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + 
"target": "1684" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1411, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "1835" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1412, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[StatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "2666" + } + ], + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1526, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1527, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1528, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1529, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1530, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1531, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1532, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1533, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. 
`None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1534, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1535, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1536, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1537, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1538, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1539, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1540, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1541, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1542, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1543, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1544, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1545, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1546, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1547, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1548, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1549, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1550, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "BeautifulSoup" + } + ], + "target": "1252" + }, + { + "type": "reference", + "name": "BeautifulSoup" + }, + { + "type": "reference", + "name": "Tag" + } + ], + "target": "1395" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1414, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "with_parsel_static_parser", + "parsedDocstring": { + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 261 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content." + } + ] + }, + "flags": {}, + "id": 1415, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_parsel_static_parser", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1416, + "kind": 32768, + "kindString": "Parameter", + "name": "rendering_type_predictor", + "type": { + "name": "RenderingTypePredictor | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RenderingTypePredictor", + "target": "1320" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1417, + "kind": 32768, + "kindString": "Parameter", + "name": "result_checker", + "type": { + "name": "Callable[[RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1418, + "kind": 32768, + "kindString": "Parameter", + "name": "result_comparator", + "type": { + "name": "Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + 
"name": "[RequestHandlerRunResult, RequestHandlerRunResult]" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1419, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_crawler_specific_kwargs", + "type": { + "name": "_PlaywrightCrawlerAdditionalOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "1835" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1420, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[StatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "2666" + } + ], + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1526, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1527, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1528, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1529, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1530, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1531, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1532, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1533, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. 
`None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1534, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1535, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1536, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1537, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1538, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1539, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1540, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1541, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1542, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1543, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1544, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1545, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1546, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1547, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1548, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1549, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1550, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ParsedHttpCrawlingContext", + "typeArguments": [ + { + "type": "reference", + "name": "Selector" + } + ], + "target": "1252" + }, + { + "type": "reference", + "name": "Selector" + }, + { + "type": "reference", + "name": "Selector" + } + ], + "target": "1395" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1422, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 437 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`." + } + ] + }, + "flags": {}, + "id": 1423, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1424, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[AdaptivePlaywrightPreNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1425, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_only", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": 
[ + { + "kind": "text", + "text": "Post navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` and `response` objects by raising\n`AdaptiveContextError`." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1426, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Post navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` and `response` objects by raising\n`AdaptiveContextError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 462 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Post navigation hooks for adaptive crawler are delegated to sub crawlers.\n\nOptionally parametrized decorator.\nHooks are wrapped in context that handles possibly missing `page` and `response` objects by raising\n`AdaptiveContextError`." 
+ } + ] + }, + "flags": {}, + "id": 1427, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1428, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[AdaptivePlaywrightPostNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1429, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_only", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1430, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_http_only_request_handler_runs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 488 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + 
"flags": {}, + "id": 1431, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_http_only_request_handler_runs", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1432, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_browser_request_handler_runs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 491 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1433, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_browser_request_handler_runs", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1434, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "track_rendering_type_mispredictions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 494 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1435, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "track_rendering_type_mispredictions", + "parameters": 
[], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4105, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4106, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4107, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4108, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4109, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4110, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4111, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4112, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4113, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4114, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4115, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4116, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4117, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4118, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4119, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.\n\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects\nthat it may bring a performance benefit.\nIt uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.\n\n### Usage\n```python\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\ncrawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n max_requests_per_crawl=10, # Limit the max requests per crawl.\n playwright_crawler_specific_kwargs={'browser_type': 'chromium'},\n)\n\n@crawler.router.default_handler\nasync def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:\n # Do some processing using `parsed_content`\n context.log.info(context.parsed_content.title)\n\n # Locate element h2 within 5 seconds\n h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n # Do stuff with element found by the selector\n context.log.info(h2)\n\n # Find more links and enqueue them.\n await context.enqueue_links()\n # Save some data.\n await context.push_data({'Visited url': context.request.url})\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": 
"docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1396, + 4116, + 4112, + 4119, + 4113, + 4118, + 4110, + 4111, + 4109, + 4114, + 1426, + 1422, + 4115, + 4108, + 1432, + 1430, + 1434, + 4117, + 1405, + 1414 + ], + "title": "Methods" + }, + { + "children": [ + 4105, + 4106, + 4107 + ], + "title": "Properties" + } + ], + "id": 1395, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "AdaptivePlaywrightCrawler", + "parsedDocstring": { + "text": "An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.\n\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects\nthat it may bring a performance benefit.\nIt uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.\n\n### Usage\n```python\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\ncrawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n max_requests_per_crawl=10, # Limit the max requests per crawl.\n playwright_crawler_specific_kwargs={'browser_type': 'chromium'},\n)\n\n@crawler.router.default_handler\nasync def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:\n # Do some processing using `parsed_content`\n context.log.info(context.parsed_content.title)\n\n # Locate element h2 within 5 seconds\n h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n # Do stuff with element found by the selector\n context.log.info(h2)\n\n # Find more links and enqueue them.\n await context.enqueue_links()\n # Save some data.\n await context.push_data({'Visited url': context.request.url})\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + 
"line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawler", + "target": "1555", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1437, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "result", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 500 + } + ], + "type": { + "name": "RequestHandlerRunResult | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestHandlerRunResult", + "target": "394" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1438, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "exception", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 501 + } + ], + "type": { + "name": "Exception | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1438, + 1437 + ], + "title": "Properties" 
+ } + ], + "id": 1436, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawler", + "name": "SubCrawlerRun", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 499 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1439, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "TStaticParseResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1440, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "TStaticSelectResult", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1441, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": 
"AdaptiveContextError", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1443, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function to perform infinite scrolling on the page.\n\nThis scrolls to the bottom, triggering the loading of additional content if present.\nRaises `AdaptiveContextError` if accessed during static crawling." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1444, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "A function to perform infinite scrolling on the page.\n\nThis scrolls to the bottom, triggering the loading of additional content if present.\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1445, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Response", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1446, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "wait_for_selector", + "parsedDocstring": { + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return `None` once it is found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 1447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1448, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear." + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1449, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1450, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "query_selector_one", + "parsedDocstring": { + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "Result of used static parser `select` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Result of used static parser `select` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return first element found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 1451, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "query_selector_one", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1452, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1453, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "TStaticSelectResult | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and 
return all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1454, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "query_selector_all", + "parsedDocstring": { + "text": "Locate element by css selector and return all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n", + "args": { + "selector": "Css selector to be used to locate specific element on page.", + "timeout": "Timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "List of results of used static parser `select` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "List of results of used static parser `select` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Locate element by css selector and return all elements found.\n\nIf element is not found within timeout, `TimeoutError` is raised.\n" + } + ] + }, + "flags": {}, + "id": 1455, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "query_selector_all", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector to be used to locate specific element on page." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1456, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1457, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1458, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "parse_with_static_parser", + "parsedDocstring": { + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n", + "args": { + "selector": "css selector to be used to locate specific element on page.", + "timeout": "timeout that defines how long the function wait for the selector to appear.\n" + }, + "returns": "Result of used static parser `parse_text` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Result of used static parser `parse_text` method." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\nIf element is not found within timeout, TimeoutError is raised.\n" + } + ] + }, + "flags": {}, + "id": 1459, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_with_static_parser", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "css selector to be used to locate specific element on page." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1460, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "timeout that defines how long the function wait for the selector to appear.\n" + } + ] + }, + "defaultValue": "timedelta(seconds=5)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1461, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "TStaticParseResult", + "type": "reference", + "target": "1382" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1462, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 1463, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1464, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + } + ], + "target": "1252" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1465, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ], + "target": "1281" + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ], + "target": "1442" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1466, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_playwright_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 179 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `PlaywrightCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1467, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_playwright_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1468, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "PlaywrightCrawlingContext", + "type": "reference", + "target": "1842" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1469, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "AbstractHttpParser", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ], + "target": "1281" + } + } + ], + "type": { + "name": "AdaptivePlaywrightCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStaticParseResult", + "target": "1382" + }, + { + "type": "reference", + "name": "TStaticSelectResult", + "target": "1383" + } + ], + "target": "1442" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4337, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 1253, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4338, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 1254, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4339, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 1255, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an 
existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4340, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1257, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1258, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "1245" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1259, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1260, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1261, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + 
"inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4341, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4342, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4343, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4344, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4345, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4346, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4347, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4348, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4349, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4350, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4351, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4352, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4353, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4354, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4353, + 4354, + 4341, + 4340, + 1462, + 1466, + 4342, + 1458, + 1454, + 1450, + 1446 + ], + "title": "Methods" + }, + { + "children": [ + 4348, + 4338, + 4339, + 4351, + 4343, + 1444, + 4352, + 1443, + 4337, + 4346, + 4349, + 4344, + 1445, + 4347, + 4345, + 4350 + ], + "title": "Properties" + } + ], + "id": 1442, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": 
"AdaptivePlaywrightCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "1252", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1471, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "BlockRequestsFunction | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BlockRequestsFunction", + "target": "1793" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1472, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "goto_options", + "parsedDocstring": { + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 218 + } + ], + "type": { + "name": "GotoOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "GotoOptions", + "target": "1810" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1473, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1474, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_pre_navigation_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1475, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_pre_navigation_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1476, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4255, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4256, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4257, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4258, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4259, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4260, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4261, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4262, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4263, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4264, + "module": "_types", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 664 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4265, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4266, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper around 
BasicCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4265, + 4266, + 1474, + 4264 + ], + "title": "Methods" + }, + { + "children": [ + 4259, + 1471, + 4262, + 1472, + 4263, + 1473, + 4257, + 4260, + 4255, + 4258, + 4256, + 4261 + ], + "title": "Properties" + } + ], + "id": 1470, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "AdaptivePlaywrightPreNavCrawlingContext", + "parsedDocstring": { + "text": "A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 208 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlingContext", + "target": "504", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1478, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1479, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL.\n\nRaises `AdaptiveContextError` if accessed during static crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 274 + } + ], + "type": { + "name": "Response", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing post-navigation context." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1480, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "from_post_navigation_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing post-navigation context." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 284 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing post-navigation context." + } + ] + }, + "flags": {}, + "id": 1481, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_post_navigation_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1482, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext | PlaywrightPostNavCrawlingContext", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpCrawlingContext", + "target": "1245" + }, + { + "type": "reference", + "name": "PlaywrightPostNavCrawlingContext", + "target": "1846" + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4323, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4324, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4325, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4326, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4327, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4328, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4329, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4330, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4331, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4332, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4333, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4334, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4335, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4336, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4335, + 4336, + 4323, + 1480, + 4324 + ], + "title": "Methods" + }, + { + "children": [ + 4330, + 4333, + 4325, + 4334, + 1478, + 4328, + 4331, + 4326, + 1479, + 4329, + 4327, + 4332 + ], + "title": "Properties" + } + ], + "id": 1477, + "module": "crawlers._adaptive_playwright._adaptive_playwright_crawling_context", + "name": "AdaptivePlaywrightPostNavCrawlingContext", + "parsedDocstring": { + "text": "A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\nTrying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 254 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpCrawlingContext", + "target": "1245", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1483, + "module": "crawlers._basic._context_pipeline", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1484, + "module": 
"crawlers._basic._context_pipeline", + "name": "TMiddlewareCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1486, + "module": "crawlers._basic._context_pipeline", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1487, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1488, + "kind": 32768, + "kindString": "Parameter", + "name": "middleware", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "AsyncGenerator", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "1484" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1489, + "kind": 32768, + "kindString": "Parameter", + "name": "input_context", + "type": { + "name": "TCrawlingContext", + "type": 
"reference", + "target": "41" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1490, + "module": "crawlers._basic._context_pipeline", + "name": "action", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1491, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "action", + "parameters": [], + "type": { + "name": "TMiddlewareCrawlingContext", + "type": "reference", + "target": "1484" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1492, + "module": "crawlers._basic._context_pipeline", + "name": "cleanup", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1493, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1494, + "kind": 32768, + "kindString": "Parameter", + "name": "final_consumer_exception", + "type": { + "name": "Exception | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": 
"Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Helper wrapper class to make the middleware easily observable by open telemetry instrumentation." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1486, + 1490, + 1492 + ], + "title": "Methods" + } + ], + "id": 1485, + "module": "crawlers._basic._context_pipeline", + "name": "_Middleware", + "parsedDocstring": { + "text": "Helper wrapper class to make the middleware easily observable by open telemetry instrumentation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1496, + "module": "crawlers._basic._context_pipeline", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1497, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1498, + "kind": 32768, + "kindString": "Parameter", + "name": "_middleware", + "type": { + "name": "Callable[ [TCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception 
| None], ] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "AsyncGenerator", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "1484" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Exception" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1499, + "kind": 32768, + "kindString": "Parameter", + "name": "_parent", + "type": { + "name": "ContextPipeline[BasicCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "504" + } + ], + "target": "1495" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1500, + "module": "crawlers._basic._context_pipeline", + "name": "__call__", + "parsedDocstring": { + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run a crawling context through the middleware chain and pipe it into a consumer function.\n\nExceptions from the consumer function are wrapped together with the final crawling context." + } + ] + }, + "flags": {}, + "id": 1501, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__call__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1502, + "kind": 32768, + "kindString": "Parameter", + "name": "crawling_context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1503, + "kind": 32768, + "kindString": "Parameter", + "name": "final_context_consumer", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1504, + "module": "crawlers._basic._context_pipeline", + "name": "compose", + 
"parsedDocstring": { + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n", + "returns": "The extended pipeline instance, providing a fluent interface" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The extended pipeline instance, providing a fluent interface" + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a middleware to the pipeline.\n\nThe middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\nThe part before the yield can be used for initialization and the part after it for cleanup.\n" + } + ] + }, + "flags": {}, + "id": 1505, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "compose", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1506, + "kind": 32768, + "kindString": "Parameter", + "name": "middleware", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "AsyncGenerator", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "1484" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "ContextPipeline", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TMiddlewareCrawlingContext", + "target": "1484" + } + ], + "target": "1495" + } + } + ] 
+ } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.\n\nThe enhancement is done by a chain of middlewares that are added to the pipeline after it's creation." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1500, + 1496, + 1504 + ], + "title": "Methods" + } + ], + "id": 1495, + "module": "crawlers._basic._context_pipeline", + "name": "ContextPipeline", + "parsedDocstring": { + "text": "Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.\n\nThe enhancement is done by a chain of middlewares that are added to the pipeline after it's creation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Replace context's isolated copies with originals after handler execution." + } + ] + }, + "decorations": [ + { + "name": "contextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 1507, + "module": "crawlers._basic._context_utils", + "name": "swapped_context", + "parsedDocstring": { + "text": "Replace context's isolated copies with originals after handler execution." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_context_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Replace context's isolated copies with originals after handler execution." 
+ } + ] + }, + "flags": {}, + "id": 1508, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "swapped_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1509, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1510, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1511, + "module": "crawlers._basic._logging_utils", + "name": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_logging_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1512, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1513, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout_error", + "type": { + "name": "asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError", + "type": "union", + "types": [ + { + "type": "reference", + "name": "asyncio.exceptions.TimeoutError" + }, + { + "type": "reference", + 
"name": "crawlee.errors.UserHandlerTimeoutError" + } + ] + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1514, + "module": "crawlers._basic._logging_utils", + "name": "get_one_line_error_summary_if_possible", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_logging_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_one_line_error_summary_if_possible", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1516, + "kind": 32768, + "kindString": "Parameter", + "name": "error", + "type": { + "name": "Exception", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1517, + "module": "crawlers._basic._basic_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 1518, + "module": "crawlers._basic._basic_crawler", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1519, + "module": "crawlers._basic._basic_crawler", + "name": "TRequestIterator", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1520, + "module": "crawlers._basic._basic_crawler", + "name": "TParams", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1521, + "module": "crawlers._basic._basic_crawler", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": 
"Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1522, + "module": "crawlers._basic._basic_crawler", + "name": "ErrorHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1523, + "module": "crawlers._basic._basic_crawler", + "name": "FailedRequestHandler", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1524, + "module": "crawlers._basic._basic_crawler", + "name": "SkippedRequestCallback", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. 
Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1526, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1527, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1528, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1529, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1530, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1531, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1532, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1533, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. 
`None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1534, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1535, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1536, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1537, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1538, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1539, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1540, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1541, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1542, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1543, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1544, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1545, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1546, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1547, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. 
This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1548, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1549, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1550, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Non-generic options the `BasicCrawler` constructor." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1541, + 1545, + 1539, + 1526, + 1542, + 1527, + 1532, + 1550, + 1546, + 1544, + 1536, + 1533, + 1534, + 1535, + 1531, + 1540, + 1529, + 1547, + 1538, + 1530, + 1543, + 1549, + 1548, + 1528, + 1537 + ], + "title": "Properties" + } + ], + "id": 1525, + "module": "crawlers._basic._basic_crawler", + "name": "_BasicCrawlerOptions", + "parsedDocstring": { + "text": "Non-generic options the `BasicCrawler` constructor." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BasicCrawlerOptions", + "target": "1554", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1552, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1553, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "2707" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generic options the `BasicCrawler` constructor." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1552, + 1553 + ], + "title": "Properties" + } + ], + "id": 1551, + "module": "crawlers._basic._basic_crawler", + "name": "_BasicCrawlerOptionsGeneric", + "parsedDocstring": { + "text": "Generic options the `BasicCrawler` constructor." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BasicCrawlerOptions", + "target": "1554", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4029, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4030, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4031, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4032, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4033, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4034, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4035, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4036, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4037, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4038, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4039, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4040, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4041, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4042, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4043, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4044, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4045, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4046, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4047, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4048, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4049, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4050, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4051, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4052, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. 
This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4053, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4054, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4055, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `BasicCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4046, + 4050, + 4044, + 4031, + 4047, + 4032, + 4037, + 4055, + 4051, + 4049, + 4041, + 4038, + 4039, + 4040, + 4036, + 4029, + 4045, + 4034, + 4052, + 4043, + 4035, + 4030, + 4048, + 4054, + 4053, + 4033, + 4042 + ], + "title": "Properties" + } + ], + "id": 1554, + "module": "crawlers._basic._basic_crawler", + "name": "BasicCrawlerOptions", + "parsedDocstring": { + "text": "Arguments for the `BasicCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 235 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_BasicCrawlerOptionsGeneric", + "target": "1551", + "type": "reference" + }, + { + "name": "_BasicCrawlerOptions", + "target": "1525", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "HttpCrawlerOptions", + "target": "1264", + "type": "reference" + }, + { + "name": "PlaywrightCrawlerOptions", + "target": "1841", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1556, + "module": "crawlers._basic._basic_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "configuration": "The `Configuration` instance. 
Some of its properties are used as defaults for the crawler.", + "event_manager": "The event manager for managing events for the crawler and all its components.", + "storage_client": "The storage client for managing storages for the crawler and all its components.", + "request_manager": "Manager of requests that should be processed by the crawler.", + "session_pool": "A custom `SessionPool` instance, allowing the use of non-default configuration.", + "proxy_configuration": "HTTP proxy configuration used when making requests.", + "http_client": "HTTP client used by `BasicCrawlingContext.send_request` method.", + "request_handler": "A callable responsible for handling requests.", + "max_request_retries": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`).", + "max_requests_per_crawl": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved.", + "max_session_rotations": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\nThe session rotations are not counted towards the `max_request_retries` limit.", + "max_crawl_depth": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. 
Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions.", + "use_session_pool": "Enable the use of a session pool for managing sessions during crawling.", + "retry_on_blocked": "If True, the crawler attempts to bypass bot protections automatically.", + "additional_http_error_status_codes": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered.", + "ignore_http_error_status_codes": "HTTP status codes that are typically considered errors but should be treated\nas successful responses.", + "concurrency_settings": "Settings to fine-tune concurrency levels.", + "request_handler_timeout": "Maximum duration allowed for a single request handler to run.", + "statistics": "A custom `Statistics` instance, allowing the use of non-default configuration.", + "abort_on_error": "If True, the crawler stops immediately when any request handler error occurs.", + "keep_alive": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler.", + "configure_logging": "If True, the crawler will set up logging infrastructure automatically.", + "statistics_log_format": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages.", + "respect_robots_txt_file": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`", + "status_message_logging_interval": "Interval for logging the crawler status messages.", + "status_message_callback": "Allows overriding the default status message. The default status message is\nprovided in the parameters. Returning `None` suppresses the status message.", + "id": "Identifier used for crawler state tracking. 
Use the same id across multiple crawlers to share state\nbetween them.", + "_context_pipeline": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_additional_context_managers": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`.", + "_logger": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 275 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1557, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1558, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1559, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1560, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1561, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1562, + "kind": 32768, + "kindString": "Parameter", + "name": "session_pool", + "type": { + "name": "SessionPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPool", + "target": "2498" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1563, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_configuration", + "type": { + "name": "ProxyConfiguration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyConfiguration", + "target": "263" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1564, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpClient", + "target": "2070" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1565, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler", + "type": { + "name": "Callable[[TCrawlingContext], Awaitable[None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[TCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1566, + "kind": 32768, + "kindString": "Parameter", + "name": "max_request_retries", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching\nthis limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means\nno limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\nthis value. If used together with `keep_alive`, then the crawler will be kept alive only until\n`max_requests_per_crawl` is achieved." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1567, + "kind": 32768, + "kindString": "Parameter", + "name": "max_requests_per_crawl", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session\nif a proxy error occurs or if the website blocks the request.\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "defaultValue": "10", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1568, + "kind": 32768, + "kindString": "Parameter", + "name": "max_session_rotations", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\nthis depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\nof links. Requests at the maximum depth will still be processed, but no new links will be enqueued\nfrom those requests. If not set, crawling continues without depth restrictions." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1569, + "kind": 32768, + "kindString": "Parameter", + "name": "max_crawl_depth", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1570, + "kind": 32768, + "kindString": "Parameter", + "name": "use_session_pool", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1571, + "kind": 32768, + "kindString": "Parameter", + "name": "retry_on_blocked", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors,\ntriggering automatic retries when encountered." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1572, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated\nas successful responses." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1573, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "Iterable[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1574, + "kind": 32768, + "kindString": "Parameter", + "name": "concurrency_settings", + "type": { + "name": "ConcurrencySettings | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ConcurrencySettings", + "target": "328" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1575, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handler_timeout", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1576, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics[TStatisticsState] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1577, + "kind": 32768, + "kindString": "Parameter", + "name": "abort_on_error", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, it will keep crawler alive even if there are no requests in queue.\nUse `crawler.stop()` to exit the crawler." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1578, + "kind": 32768, + "kindString": "Parameter", + "name": "keep_alive", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1579, + "kind": 32768, + "kindString": "Parameter", + "name": "configure_logging", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\noutputs statistics as plain text log messages." 
+ } + ] + }, + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1580, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file\nfor each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\nvia `EnqueueLinksFunction`" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1581, + "kind": 32768, + "kindString": "Parameter", + "name": "respect_robots_txt_file", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "defaultValue": "timedelta(seconds=10)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1582, + "kind": 32768, + "kindString": "Parameter", + "name": "status_message_logging_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is\nprovided in the parameters. Returning `None` suppresses the status message." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1583, + "kind": 32768, + "kindString": "Parameter", + "name": "status_message_callback", + "type": { + "name": "Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[StatisticsState, StatisticsState | None, str]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state\nbetween them." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1584, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enables extending the request lifecycle and modifying the crawling context.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1585, + "kind": 32768, + "kindString": "Parameter", + "name": "_context_pipeline", + "type": { + "name": "ContextPipeline[TCrawlingContext] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ContextPipeline", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "1495" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional context managers used throughout the crawler lifecycle.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1586, + "kind": 32768, + "kindString": "Parameter", + "name": "_additional_context_managers", + "type": { + "name": "Sequence[AbstractAsyncContextManager] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "reference", + "name": "AbstractAsyncContextManager" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A logger instance, typically provided by a subclass, for consistent logging labels.\nIntended for use by subclasses rather than direct instantiation of `BasicCrawler`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1587, + "kind": 32768, + "kindString": "Parameter", + "name": "_logger", + "type": { + "name": "logging.Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "logging.Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1588, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1589, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "78" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "router" + } + ], + "flags": {}, + "groups": [], + "id": 1590, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 524 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1591, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "router", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1592, + "kind": 32768, + "kindString": "Parameter", + "name": "router", + "type": { + "name": "Router", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "78" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1593, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "2707" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1594, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1597, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1599, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + } + } + 
] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1604, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + 
"type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1609, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + }, + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "504" + } + ] + } + ], + "target": "1522" + } + } + ], + "type": { + "name": "ErrorHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "1522" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1612, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + }, + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "504" + } + ] + } + ], + "target": "1523" + } + } + ], + "type": { + "name": "FailedRequestHandler", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TCrawlingContext", + "target": "41" + } + ], + "target": "1523" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1615, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1618, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": { + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Sequence", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1622, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for 
all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1630, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": 
"use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1633, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skips the specified number of items at the start." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 524, + "module": "_types", + "name": "offset", + "parsedDocstring": { + "text": "Skips the specified number of items at the start." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 694 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 525, + "module": "_types", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of items to retrieve. Unlimited if None." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 697 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 526, + "module": "_types", + "name": "clean", + "parsedDocstring": { + "text": "Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 700 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 527, + "module": "_types", + "name": "desc", + "parsedDocstring": { + "text": "Set to True to sort results in descending order." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 703 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 528, + "module": "_types", + "name": "fields", + "parsedDocstring": { + "text": "Fields to include in each item. Sorts fields as specified if provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 706 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 529, + "module": "_types", + "name": "omit", + "parsedDocstring": { + "text": "Fields to exclude from each item." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 709 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds items by a specified array field, turning each element into a separate item." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 530, + "module": "_types", + "name": "unwind", + "parsedDocstring": { + "text": "Unwinds items by a specified array field, turning each element into a separate item." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 712 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes empty items from the results if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 531, + "module": "_types", + "name": "skip_empty", + "parsedDocstring": { + "text": "Excludes empty items from the results if True." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 715 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 532, + "module": "_types", + "name": "skip_hidden", + "parsedDocstring": { + "text": "Excludes fields starting with '#' if True." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 718 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to be flattened in returned items." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 533, + "module": "_types", + "name": "flatten", + "parsedDocstring": { + "text": "Fields to be flattened in returned items." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 721 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the dataset view to be used." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 534, + "module": "_types", + "name": "view", + "parsedDocstring": { + "text": "Specifies the dataset view to be used." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 724 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1639, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. 
Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A basic web crawler providing a framework for crawling websites.\n\nThe `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their\nown page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific\npurposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,\n`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. 
If you are an advanced user and want full\ncontrol over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic\nyourself.\n\nThe crawling process begins with URLs provided by a `RequestProvider` instance. Each request is then\nhandled by a user-defined `request_handler` function, which processes the page and extracts the data.\n\nThe `BasicCrawler` includes several common features for crawling, such as:\n- automatic scaling based on the system resources,\n- retries for failed requests,\n- session management,\n- statistics tracking,\n- request routing via labels,\n- proxy rotation,\n- direct storage interaction helpers,\n- and more." + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1556, + 1622, + 1609, + 1639, + 1612, + 1633, + 1599, + 1604, + 1597, + 1615, + 1590, + 1618, + 1594, + 1630 + ], + "title": "Methods" + }, + { + "children": [ + 1588, + 1589, + 1593 + ], + "title": "Properties" + } + ], + "id": 1555, + "module": "crawlers._basic._basic_crawler", + "name": "BasicCrawler", + "parsedDocstring": { + "text": "A basic web crawler providing a framework for crawling websites.\n\nThe `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their\nown page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific\npurposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,\n`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full\ncontrol over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic\nyourself.\n\nThe crawling process begins with URLs provided by a `RequestProvider` instance. 
Each request is then\nhandled by a user-defined `request_handler` function, which processes the page and extracts the data.\n\nThe `BasicCrawler` includes several common features for crawling, such as:\n- automatic scaling based on the system resources,\n- retries for failed requests,\n- session management,\n- statistics tracking,\n- request routing via labels,\n- proxy rotation,\n- direct storage interaction helpers,\n- and more." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "AbstractHttpCrawler", + "target": "1266", + "type": "reference" + }, + { + "name": "AdaptivePlaywrightCrawler", + "target": "1395", + "type": "reference" + }, + { + "name": "PlaywrightCrawler", + "target": "1815", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1647, + "module": "crawlers._beautifulsoup._beautifulsoup_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "parser": "The type of parser that should be used by `BeautifulSoup`.", + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1648, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of parser that should be used by `BeautifulSoup`." + } + ] + }, + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1649, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + "target": "1684" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for the HTTP request." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1265, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "navigation_timeout", + "parsedDocstring": { + "text": "Timeout for the HTTP request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4189, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4190, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4191, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4192, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4193, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4194, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4195, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4196, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4197, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4198, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4199, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4200, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4201, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4202, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4203, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4204, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4205, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4206, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4207, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4208, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4209, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4210, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4211, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4212, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4213, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4214, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. 
Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4215, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4135, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ } + ] + }, + "flags": {}, + "id": 1273, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1274, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4136, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1276, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1277, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4137, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called after each navigation.\n", + "args": { + "hook": "A coroutine function to be called after each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called after each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1280, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[HttpCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4138, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4139, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4140, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4141, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4142, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4143, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4144, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4145, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4146, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4147, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4148, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4149, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4150, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4151, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4152, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.\n`BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\ncrawler = BeautifulSoupCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.soup.title.string if context.soup.title else None,\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1647, + 4149, + 4135, + 4145, + 4152, + 4146, + 4151, + 4143, + 4144, + 4142, + 4147, + 4137, + 4136, + 4148, + 4141, + 4150 + ], + "title": "Methods" + }, + { + "children": [ + 4138, + 4139, + 4140 + ], + "title": "Properties" + } + ], + "id": 1646, + "module": "crawlers._beautifulsoup._beautifulsoup_crawler", + "name": "BeautifulSoupCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.\n`BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\ncrawler = BeautifulSoupCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.soup.title.string if context.soup.title else None,\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "1266", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convenience alias." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1652, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "soup", + "parsedDocstring": { + "text": "Convenience alias." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "BeautifulSoup", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1653, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `ParsedHttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 1654, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1655, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BeautifulSoup" + } + ], + "target": "1252" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1656, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." 
+ } + ] + }, + "flags": {}, + "id": 1657, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4355, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 1253, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4356, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 1254, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4357, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": 
"" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 1255, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4358, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 1257, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1258, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "1245" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1259, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1260, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1261, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4359, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4360, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4361, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4362, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4363, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4364, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4365, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4366, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4367, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4368, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4369, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4370, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4371, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4372, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `BeautifulSoupCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4371, + 4372, + 4359, + 4358, + 1653, + 4360, + 1656 + ], + "title": "Methods" + }, + { + "children": [ + 4366, + 4356, + 4357, + 4369, + 4361, + 4370, + 4355, + 4364, + 4367, + 4362, + 4365, + 4363, + 1652, + 4368 + ], + "title": "Properties" + } + ], + "id": 1651, + "module": "crawlers._beautifulsoup._beautifulsoup_crawling_context", + "name": "BeautifulSoupCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `BeautifulSoupCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "1252", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1658, + "module": "crawlers._beautifulsoup._utils", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n", + "args": { + "source": "Input markup string or `BeautifulSoup` object.\n" + }, + "returns": "Newline separated plain text without tags." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Newline separated plain text without tags." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n" + } + ] + }, + "flags": {}, + "id": 1659, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Input markup string or `BeautifulSoup` object.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1660, + "kind": 32768, + "kindString": "Parameter", + "name": "source", + "type": { + "name": "str | Tag", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Tag" + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1662, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1663, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "'lxml'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1664, + "kind": 32768, + "kindString": "Parameter", + "name": "parser", + "type": { + "name": "BeautifulSoupParserType", + "type": "reference", + "target": "1684" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1665, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse HTTP response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "flags": {}, + "id": 1283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1284, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1668, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 1286, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1287, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1671, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 1296, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1297, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1298, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + 
"groups": [], + "id": 1675, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 1289, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1290, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1291, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1679, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.", + "attribute": "Which node attribute to extract the links from.\n" + }, + "returns": "Iterable of strings that contain found links." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 1300, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1301, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1302, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1303, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4027, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. 
Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 1293, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1294, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "1848" + }, + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for parsing HTTP response using `BeautifulSoup`." 
+ } + ] + }, + "decorations": [ + { + "args": "('HTTP parsers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1662, + 1679, + 4027, + 1671, + 1665, + 1668, + 1675 + ], + "title": "Methods" + } + ], + "id": 1661, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "BeautifulSoupParser", + "parsedDocstring": { + "text": "Parser for parsing HTTP response using `BeautifulSoup`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "1281", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1684, + "module": "crawlers._beautifulsoup._beautifulsoup_parser", + "name": "BeautifulSoupParserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1686, + "module": "crawlers._http._http_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1687, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4029, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4030, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4031, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4032, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4033, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4034, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4035, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4036, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4037, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4038, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4039, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4040, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4041, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4042, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4043, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4044, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4045, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4046, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4047, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4048, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4049, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4050, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4051, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4052, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4053, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4054, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4055, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4153, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "flags": {}, + "id": 1273, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1274, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4154, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { 
+ "hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1276, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1277, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4155, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called after each navigation.\n", + "args": { + "hook": "A coroutine function to be called after each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called after each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1280, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[HttpCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4156, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4157, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4158, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4159, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4160, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4161, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4162, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4163, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4164, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4165, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4166, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4167, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4168, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4169, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4170, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specific version of generic `AbstractHttpCrawler`.\n\nIt uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are\ndoing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using\n`BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\ncrawler = HttpCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'response': (await context.http_response.read()).decode()[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1686, + 4167, + 4153, + 4163, + 4170, + 4164, + 4169, + 4161, + 4162, + 4160, + 4165, + 4155, + 4154, + 4166, + 4159, + 4168 + ], + "title": "Methods" + }, + { + "children": [ + 4156, + 4157, + 4158 + ], + "title": 
"Properties" + } + ], + "id": 1685, + "module": "crawlers._http._http_crawler", + "name": "HttpCrawler", + "parsedDocstring": { + "text": "Specific version of generic `AbstractHttpCrawler`.\n\nIt uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are\ndoing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using\n`BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\ncrawler = HttpCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: HttpCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'response': (await context.http_response.read()).decode()[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "1266", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1690, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse HTTP response.\n", + "args": { + 
"response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "flags": {}, + "id": 1283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1284, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1693, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 1286, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1287, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1696, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 1289, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1290, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1291, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation 
that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1700, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 1293, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. 
Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1294, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "1848" + }, + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1703, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 1296, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1297, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1298, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1707, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. 
Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.", + "attribute": "Which node attribute to extract the links from.\n" + }, + "returns": "Iterable of strings that contain found links." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 1300, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1301, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1302, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1303, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A no-op parser that returns raw response content without any processing.\n\nThis is useful when you only need the raw response data and don't require HTML\nparsing, link extraction, or content selection functionality." + } + ] + }, + "decorations": [ + { + "args": "('HTTP parsers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1707, + 1700, + 1703, + 1690, + 1693, + 1696 + ], + "title": "Methods" + } + ], + "id": 1689, + "module": "crawlers._http._http_parser", + "name": "NoParser", + "parsedDocstring": { + "text": "A no-op parser that returns raw response content without any processing.\n\nThis is useful when you only need the raw response data and don't require HTML\nparsing, link extraction, or content selection functionality." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_http/_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "1281", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1713, + "module": "crawlers._parsel._parsel_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "kwargs": "Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1714, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for the HTTP request." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1265, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "navigation_timeout", + "parsedDocstring": { + "text": "Timeout for the HTTP request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4189, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4190, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4191, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4192, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4193, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4194, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4195, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4196, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4197, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4198, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4199, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4200, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. 
If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4201, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4202, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4203, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4204, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4205, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4206, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4207, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4208, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4209, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4210, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4211, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4212, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4213, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4214, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4215, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpCrawler.__init__", + "target": 1267, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4171, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "create_parsed_http_crawler_class", + "parsedDocstring": { + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 93 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a specific version of `AbstractHttpCrawler` class.\n\nThis is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\nWhile `AbstractHttpCrawler` allows its two generic parameters to be independent,\nthis method simplifies cases where `TParseResult` is used for both generic parameters." + } + ] + }, + "flags": {}, + "id": 1273, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_parsed_http_crawler_class", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1274, + "kind": 32768, + "kindString": "Parameter", + "name": "static_parser", + "type": { + "name": "AbstractHttpParser[TParseResult, TSelectResult]", + "type": "reference" + } + } + ], + "type": { + "name": "type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.create_parsed_http_crawler_class", + "target": 1272, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4172, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { 
+ "hook": "A coroutine function to be called before each navigation." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1276, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1277, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[BasicCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.pre_navigation_hook", + "target": 1275, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4173, + "module": "crawlers._abstract_http._abstract_http_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called after each navigation.\n", + "args": { + "hook": "A coroutine function to be called after each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 325 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called after each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1280, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable[[HttpCrawlingContext], Awaitable[None]]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpCrawler.post_navigation_hook", + "target": 1278, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4174, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4175, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4176, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4177, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4178, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4179, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4180, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4181, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4182, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4183, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4184, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4185, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4186, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4187, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4188, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `ParselParser` which is used to parse `HttpResponse`.\n`ParselParser` uses following library for parsing: https://pypi.org/project/parsel/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\ncrawler = ParselCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: ParselCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.selector.css('title').get(),\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1713, + 4185, + 4171, + 4181, + 4188, + 4182, + 4187, + 4179, + 4180, + 4178, + 4183, + 4173, + 4172, + 4184, + 4177, + 4186 + ], + "title": "Methods" + }, + { + "children": [ + 4174, + 4175, + 4176 + ], + "title": "Properties" + } + ], + "id": 1712, + "module": "crawlers._parsel._parsel_crawler", + "name": "ParselCrawler", + "parsedDocstring": { + "text": "A web crawler for performing HTTP requests and parsing HTML/XML content.\n\nThe `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\nIt specifies its own parser `ParselParser` which is used to parse `HttpResponse`.\n`ParselParser` uses following library for parsing: https://pypi.org/project/parsel/\n\nThe HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\nif you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\ncrawler = ParselCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: ParselCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': context.selector.css('title').get(),\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpCrawler", + "target": "1266", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convenience alias." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1717, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "selector", + "parsedDocstring": { + "text": "Convenience alias." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Selector", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1718, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "from_parsed_http_crawling_context", + "parsedDocstring": { + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`." 
+ } + ] + }, + "flags": {}, + "id": 1719, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_parsed_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1720, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "ParsedHttpCrawlingContext", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Selector" + } + ], + "target": "1252" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1721, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert the parsed HTML content to newline-separated plain text without tags." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert the parsed HTML content to newline-separated plain text without tags." 
+ } + ] + }, + "flags": {}, + "id": 1722, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4373, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "parsed_content", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.parsed_content", + "target": 1253, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4374, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.enqueue_links", + "target": 1254, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 4375, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": 
"" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.extract_links", + "target": 1255, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4376, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_http_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `HttpCrawlingContext`." 
+ } + ] + }, + "flags": {}, + "id": 1257, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_http_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1258, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "HttpCrawlingContext", + "type": "reference", + "target": "1245" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1259, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1260, + "kind": 32768, + "kindString": "Parameter", + "name": "enqueue_links", + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1261, + "kind": 32768, + "kindString": "Parameter", + "name": "extract_links", + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "ParsedHttpCrawlingContext.from_http_crawling_context", + "target": 1256, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." 
+ } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4377, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "from_basic_crawling_context", + "parsedDocstring": { + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from an existing `BasicCrawlingContext`." + } + ] + }, + "flags": {}, + "id": 1247, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_basic_crawling_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1248, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1249, + "kind": 32768, + "kindString": "Parameter", + "name": "http_response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpCrawlingContext.from_basic_crawling_context", + "target": 1246, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4378, + "module": "crawlers._abstract_http._http_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1251, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpCrawlingContext.get_snapshot", + "target": 1250, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4379, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "inheritedFrom": { + "name": "HttpCrawlingResult.http_response", + "target": 2069, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4380, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4381, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4382, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4383, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4384, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4385, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4386, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4387, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4388, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4389, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4390, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `ParselCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4389, + 4390, + 4377, + 4376, + 1718, + 4378, + 1721 + ], + "title": "Methods" + }, + { + "children": [ + 4384, + 4374, + 4375, + 4387, + 4379, + 4388, + 4373, + 4382, + 4385, + 4380, + 1717, + 4383, + 4381, + 4386 + ], + "title": "Properties" + } + ], + "id": 1716, + "module": "crawlers._parsel._parsel_crawling_context", + "name": "ParselCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `ParselCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "ParsedHttpCrawlingContext", + "target": "1252", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1723, + "module": "crawlers._parsel._utils", + "name": "html_to_text", + "parsedDocstring": { + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n", + "args": { + "source": "Input markup string or `Selector` object.\n" + }, + "returns": "Newline separated plain text without tags." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Newline separated plain text without tags." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n" + } + ] + }, + "flags": {}, + "id": 1724, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "html_to_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Input markup string or `Selector` object.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1725, + "kind": 32768, + "kindString": "Parameter", + "name": "source", + "type": { + "name": "str | Selector", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Selector" + } + ] + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1727, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse", + "parsedDocstring": { + "text": "Parse HTTP response.\n", + "args": { + "response": "HTTP response to be parsed.\n" + }, + "returns": "Parsed HTTP response." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed HTTP response." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse HTTP response.\n" + } + ] + }, + "flags": {}, + "id": 1283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP response to be parsed.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1284, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse", + "target": 1282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1730, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "parse_text", + "parsedDocstring": { + "text": "Parse text containing html.\n", + "args": { + "text": "String containing html.\n" + }, + "returns": "Parsed text." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Parsed text." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Parse text containing html.\n" + } + ] + }, + "flags": {}, + "id": 1286, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "parse_text", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String containing html.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1287, + "kind": 32768, + "kindString": "Parameter", + "name": "text", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + }, + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.parse_text", + "target": 1285, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1733, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "select", + "parsedDocstring": { + "text": "Use css selector to select page element and return it.\n", + "args": { + "parsed_content": "Content where the page element will be located.", + "selector": "Css selector used to locate desired html element.\n" + }, + "returns": "Selected element." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Selected element." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Use css selector to select page element and return it.\n" + } + ] + }, + "flags": {}, + "id": 1289, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "select", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Content where the page element will be located." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1290, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Css selector used to locate desired html element.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1291, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TSelectResult", + "target": "1244" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.select", + "target": 1288, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "decorations": [ + { + "name": 
"abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1737, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_matching_selector", + "parsedDocstring": { + "text": "Find if selector has match in parsed content.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern.\n" + }, + "returns": "True if selector has match in parsed content." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if selector has match in parsed content." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find if selector has match in parsed content.\n" + } + ] + }, + "flags": {}, + "id": 1296, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_matching_selector", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1297, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1298, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.is_matching_selector", + "target": 1295, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1741, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "find_links", + "parsedDocstring": { + "text": "Find all links in result using selector.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.", + "selector": "String used to define matching pattern for finding links.", + "attribute": "Which node attribute to extract the links from.\n" + }, + "returns": "Iterable of strings that contain found links." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Iterable of strings that contain found links." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Find all links in result using selector.\n" + } + ] + }, + "flags": {}, + "id": 1300, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "find_links", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1301, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "String used to define matching pattern for finding links." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1302, + "kind": 32768, + "kindString": "Parameter", + "name": "selector", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Which node attribute to extract the links from.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1303, + "kind": 32768, + "kindString": "Parameter", + "name": "attribute", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Iterable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + } + ], + "overwrites": { + "name": "AbstractHttpParser.find_links", + "target": 1299, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4028, + "module": "crawlers._abstract_http._abstract_http_parser", + "name": "is_blocked", + "parsedDocstring": { + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n", + "args": { + "parsed_content": "Parsed HTTP response. Result of `parse` method.\n" + }, + "returns": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. 
Empty\nstring in reason signifies no blockage detected." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "`BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty\nstring in reason signifies no blockage detected." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Detect if blocked and return BlockedInfo with additional information.\n\nDefault implementation that expects `is_matching_selector` abstract method to be implemented.\nOverride this method if your parser has different way of blockage detection.\n" + } + ] + }, + "flags": {}, + "id": 1293, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parsed HTTP response. Result of `parse` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1294, + "kind": 32768, + "kindString": "Parameter", + "name": "parsed_content", + "type": { + "name": "TParseResult", + "type": "reference", + "target": "1243" + } + } + ], + "type": { + "name": "BlockedInfo", + "type": "reference", + "target": "1848" + }, + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "AbstractHttpParser.is_blocked", + "target": 1292, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parser for parsing HTTP response using Parsel." 
+ } + ] + }, + "decorations": [ + { + "args": "('HTTP parsers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1741, + 4028, + 1737, + 1727, + 1730, + 1733 + ], + "title": "Methods" + } + ], + "id": 1726, + "module": "crawlers._parsel._parsel_parser", + "name": "ParselParser", + "parsedDocstring": { + "text": "Parser for parsing HTTP response using Parsel." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "AbstractHttpParser", + "target": "1281", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." + } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 1746, + "module": "crawlers._playwright._playwright_http_client", + "name": "browser_page_context", + "parsedDocstring": { + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Asynchronous context manager for setting the current Playwright page in the context variable." 
+ } + ] + }, + "flags": {}, + "id": 1747, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "browser_page_context", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1748, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1750, + "module": "crawlers._playwright._playwright_http_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance." 
+ } + ] + }, + "flags": {}, + "id": 1751, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1752, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2077, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2078, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "2068" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1759, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": 
"The information about the proxy to be used.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 2083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2084, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2085, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2086, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2087, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2088, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1768, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": 
"The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 2092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2093, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2094, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2095, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2097, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2099, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed 
and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1777, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "flags": {}, + "id": 2101, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4015, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 2074, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4016, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2103, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and 
clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4017, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2107, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2108, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the Playwright library.\n\nThis client uses the Playwright library 
to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\nNote: This class is pre-designated for use in `PlaywrightCrawler` only" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4016, + 4017, + 1750, + 1777, + 1752, + 1759, + 1768 + ], + "title": "Methods" + }, + { + "children": [ + 4015 + ], + "title": "Properties" + } + ], + "id": 1749, + "module": "crawlers._playwright._playwright_http_client", + "name": "PlaywrightHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the Playwright library.\n\nThis client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\nNote: This class is pre-designated for use in `PlaywrightCrawler` only" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "2070", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1780, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1781, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "BlockRequestsFunction", + "type": "reference", + "target": "1793" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1782, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "goto_options", + "parsedDocstring": { + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "GotoOptions", + "type": "reference", + "target": "1810" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1783, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1784, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4267, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4268, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4269, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4270, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4271, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4272, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4273, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4274, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4275, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4276, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4277, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The pre navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` object, before the navigation to the URL is performed." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4276, + 4277, + 1783 + ], + "title": "Methods" + }, + { + "children": [ + 4271, + 1781, + 4274, + 1782, + 4275, + 1780, + 4269, + 4272, + 4267, + 4270, + 4268, + 4273 + ], + "title": "Properties" + } + ], + "id": 1779, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "PlaywrightPreNavCrawlingContext", + "parsedDocstring": { + "text": "The pre navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` object, before the navigation to the URL is performed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawlingContext", + "target": "504", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "PlaywrightPostNavCrawlingContext", + "target": "1846", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Scroll to the bottom of a page, handling loading of additional items." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1785, + "module": "crawlers._playwright._utils", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "Scroll to the bottom of a page, handling loading of additional items." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Scroll to the bottom of a page, handling loading of additional items." + } + ] + }, + "flags": {}, + "id": 1786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "infinite_scroll", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1787, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1788, + "module": "crawlers._playwright._utils", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns.\n", + "args": { + "page": "Playwright Page object to block requests on.", + "url_patterns": "List of URL patterns to block. If None, uses default patterns.", + "extra_url_patterns": "Additional URL patterns to append to the main patterns list." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns.\n" + } + ] + }, + "flags": {}, + "id": 1789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "block_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Playwright Page object to block requests on." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1790, + "kind": 32768, + "kindString": "Parameter", + "name": "page", + "type": { + "name": "Page", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of URL patterns to block. If None, uses default patterns." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1791, + "kind": 32768, + "kindString": "Parameter", + "name": "url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional URL patterns to append to the main patterns list." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1792, + "kind": 32768, + "kindString": "Parameter", + "name": "extra_url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1794, + "module": "crawlers._playwright._types", + "name": "__call__", + "parsedDocstring": { + "text": "Call dunder method.\n", + "args": { + "url_patterns": "List of URL patterns to block. If None, uses default patterns.", + "extra_url_patterns": "Additional URL patterns to append to the main patterns list." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Call dunder method.\n" + } + ] + }, + "flags": {}, + "id": 1795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__call__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of URL patterns to block. If None, uses default patterns." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1796, + "kind": 32768, + "kindString": "Parameter", + "name": "url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional URL patterns to append to the main patterns list." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1797, + "kind": 32768, + "kindString": "Parameter", + "name": "extra_url_patterns", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.\n\nIt simplifies the process of blocking specific HTTP requests during page navigation.\nThe function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns." 
+ } + ] + }, + "decorations": [ + { + "args": "('Functions')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1794 + ], + "title": "Methods" + } + ], + "id": 1793, + "module": "crawlers._playwright._types", + "name": "BlockRequestsFunction", + "parsedDocstring": { + "text": "A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.\n\nIt simplifies the process of blocking specific HTTP requests during page navigation.\nThe function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1799, + "module": "crawlers._playwright._types", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1800, + "module": "crawlers._playwright._types", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { 
+ "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1801, + "module": "crawlers._playwright._types", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1802, + "module": "crawlers._playwright._types", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1803, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1804, + "module": "crawlers._playwright._types", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1805, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1806, + "module": "crawlers._playwright._types", + "name": "from_playwright_response", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "from_playwright_response", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1808, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Response | APIResponse", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Response" + }, + { + "type": "reference", + "name": "APIResponse" + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1809, + "kind": 32768, + "kindString": "Parameter", + "name": "protocol", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1806, + 1802, + 1804 + ], + "title": "Methods" + }, + { + "children": [ + 1801, + 1799, + 1800 + ], + "title": "Properties" + } + ], + "id": 1798, + "module": "crawlers._playwright._types", + "name": "PlaywrightHttpResponse", + "parsedDocstring": { + "text": "Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When to consider operation succeeded, defaults to 'load' event." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1811, + "module": "crawlers._playwright._types", + "name": "wait_until", + "parsedDocstring": { + "text": "When to consider operation succeeded, defaults to 'load' event." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "domcontentloaded" + }, + { + "type": "literal", + "value": "load" + }, + { + "type": "literal", + "value": "networkidle" + }, + { + "type": "literal", + "value": "commit" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Referer header value." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1812, + "module": "crawlers._playwright._types", + "name": "referer", + "parsedDocstring": { + "text": "Referer header value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments for Playwright's `Page.goto()` method." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1812, + 1811 + ], + "title": "Properties" + } + ], + "id": 1810, + "module": "crawlers._playwright._types", + "name": "GotoOptions", + "parsedDocstring": { + "text": "Keyword arguments for Playwright's `Page.goto()` method." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1813, + "module": "crawlers._playwright._playwright_crawler", + "name": "TCrawlingContext", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1814, + "module": "crawlers._playwright._playwright_crawler", + 
"name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1816, + "module": "crawlers._playwright._playwright_crawler", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "browser_pool": "A `BrowserPool` instance to be used for launching the browsers and getting pages.", + "user_data_dir": "Path to a user data directory, which stores browser session data like cookies\nand local storage.", + "browser_type": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system.\nThis option should not be used if `browser_pool` is provided.", + "browser_launch_options": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).\nThis option should not be used if `browser_pool` is provided.", + "browser_new_context_options": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).\nThis option should not be used if `browser_pool` is provided.", + "fingerprint_generator": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers.", + "headless": "Whether to run the browser in headless mode.\nThis option should not be used if `browser_pool` is provided.", + "use_incognito_pages": "By default pages share the same browser context. If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.\nThis option should not be used if `browser_pool` is provided.", + "navigation_timeout": "Timeout for navigation (the process between opening a Playwright page and calling\nthe request handler)", + "goto_options": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is\nnot supported, use `navigation_timeout` instead.", + "kwargs": "Additional keyword arguments to pass to the underlying `BasicCrawler`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1817, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1818, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_pool", + "type": { + "name": "BrowserPool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserPool", + "target": "1167" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\nthe system.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1819, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "BrowserType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BrowserType", + "target": "1135" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path to a user data directory, which stores browser session data like cookies\nand local storage." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1820, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data_dir", + "type": { + "name": "str | Path | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Path" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. 
For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1821, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_launch_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options\nare provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n[Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1822, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_new_context_options", + "type": { + "name": "Mapping[str, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is\nnot supported, use `navigation_timeout` instead." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1823, + "kind": 32768, + "kindString": "Parameter", + "name": "goto_options", + "type": { + "name": "GotoOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "GotoOptions", + "target": "1810" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional instance of implementation of `FingerprintGenerator` that is used\nto generate browser fingerprints together with consistent headers." + } + ] + }, + "defaultValue": "'default'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1824, + "kind": 32768, + "kindString": "Parameter", + "name": "fingerprint_generator", + "type": { + "name": "FingerprintGenerator | None | Literal['default']", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "FingerprintGenerator", + "target": "1979" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "default" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1825, + "kind": 32768, + "kindString": "Parameter", + "name": "headless", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "By default pages share the same browser context. 
If set to True each page uses its\nown context that is destroyed once the page is closed or crashes.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1826, + "kind": 32768, + "kindString": "Parameter", + "name": "use_incognito_pages", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for navigation (the process between opening a Playwright page and calling\nthe request handler)" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1827, + "kind": 32768, + "kindString": "Parameter", + "name": "navigation_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4029, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4030, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4031, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4032, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4033, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4034, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4035, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4036, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4037, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4038, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4039, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4040, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4041, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4042, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4043, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4044, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4045, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4046, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4047, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4048, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4049, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4050, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4051, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4052, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4053, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4054, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. 
Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 4055, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawler.__init__", + "target": 1556, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1829, + "module": "crawlers._playwright._playwright_crawler", + "name": "pre_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called before each navigation.\n", + "args": { + "hook": "A coroutine function to be called before each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 528 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called before each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1830, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "pre_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called before each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1831, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[PlaywrightPreNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1832, + "module": "crawlers._playwright._playwright_crawler", + "name": "post_navigation_hook", + "parsedDocstring": { + "text": "Register a hook to be called after each navigation.\n", + "args": { + "hook": "A coroutine function to be called after each navigation." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 536 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a hook to be called after each navigation.\n" + } + ] + }, + "flags": {}, + "id": 1833, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "post_navigation_hook", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A coroutine function to be called after each navigation." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1834, + "kind": 32768, + "kindString": "Parameter", + "name": "hook", + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[PlaywrightPostNavCrawlingContext]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The logger used by the crawler." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4120, + "module": "crawlers._basic._basic_crawler", + "name": "log", + "parsedDocstring": { + "text": "The logger used by the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 511 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.log", + "target": 1588, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Router` used to handle each individual crawling request." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4121, + "module": "crawlers._basic._basic_crawler", + "name": "router", + "parsedDocstring": { + "text": "The `Router` used to handle each individual crawling request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 516 + } + ], + "type": { + "name": "Router[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.router", + "target": 1589, + "type": "reference" + }, + "overwrites": { + "name": "BasicCrawler.router", + "target": 1590, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about the current (or last) crawler run." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4122, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "Statistics about the current (or last) crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 531 + } + ], + "type": { + "name": "Statistics[TStatisticsState]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.statistics", + "target": 1593, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4123, + "module": "crawlers._basic._basic_crawler", + "name": "stop", + "parsedDocstring": { + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n", + "args": { + "reason": "Reason for stopping that will be used in logs." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 535 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set flag to stop crawler.\n\nThis stops current crawler run regardless of whether all requests were finished.\n" + } + ] + }, + "flags": {}, + "id": 1595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stop", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reason for stopping that will be used in logs." 
+ } + ] + }, + "defaultValue": "'Stop was called externally.'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1596, + "kind": 32768, + "kindString": "Parameter", + "name": "reason", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.stop", + "target": 1594, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4124, + "module": "crawlers._basic._basic_crawler", + "name": "get_request_manager", + "parsedDocstring": { + "text": "Return the configured request manager. If none is configured, open and return the default request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 613 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the configured request manager. If none is configured, open and return the default request queue." 
+ } + ] + }, + "flags": {}, + "id": 1598, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request_manager", + "parameters": [], + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_request_manager", + "target": 1597, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4125, + "module": "crawlers._basic._basic_crawler", + "name": "get_dataset", + "parsedDocstring": { + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 623 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `Dataset` with the given ID or name. If none is provided, return the default one." 
+ } + ] + }, + "flags": {}, + "id": 1600, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_dataset", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1601, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1602, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1603, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "Dataset", + "type": "reference", + "target": "3766" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_dataset", + "target": 1599, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4126, + "module": "crawlers._basic._basic_crawler", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 639 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS." + } + ] + }, + "flags": {}, + "id": 1605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_key_value_store", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1606, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1607, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1608, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStore", + "type": "reference", + "target": "3700" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_key_value_store", + "target": 1604, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4127, + "module": "crawlers._basic._basic_crawler", + "name": "error_handler", + "parsedDocstring": { + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle errors occurring in request handlers.\n\nThe error handler is invoked after a request handler error occurs and before a retry attempt." + } + ] + }, + "flags": {}, + "id": 1610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "error_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1611, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "ErrorHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "ErrorHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.error_handler", + "target": 1609, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4128, + "module": "crawlers._basic._basic_crawler", + "name": "failed_request_handler", + "parsedDocstring": { + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 665 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle requests that exceed the maximum retry limit.\n\nThe failed request handler is invoked when a request has failed all retry attempts." + } + ] + }, + "flags": {}, + "id": 1613, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "failed_request_handler", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1614, + "kind": 32768, + "kindString": "Parameter", + "name": "handler", + "type": { + "name": "FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]", + "type": "reference" + } + } + ], + "type": { + "name": "FailedRequestHandler[TCrawlingContext]", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.failed_request_handler", + "target": 1612, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4129, + "module": "crawlers._basic._basic_crawler", + "name": "on_skipped_request", + "parsedDocstring": { + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 675 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register a function to handle skipped requests.\n\nThe skipped request handler is invoked when a request is skipped due to a collision or other reasons." + } + ] + }, + "flags": {}, + "id": 1616, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on_skipped_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1617, + "kind": 32768, + "kindString": "Parameter", + "name": "callback", + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + } + } + ], + "type": { + "name": "SkippedRequestCallback", + "type": "reference", + "target": "1524" + }, + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.on_skipped_request", + "target": 1615, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4130, + "module": "crawlers._basic._basic_crawler", + "name": "run", + "parsedDocstring": { + "text": "Run the crawler until all requests are processed.\n", + "args": 
{ + "requests": "The requests to be enqueued before the crawler starts.", + "purge_request_queue": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 683 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Run the crawler until all requests are processed.\n" + } + ] + }, + "flags": {}, + "id": 1619, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "run", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The requests to be enqueued before the crawler starts." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1620, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request] | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If this is `True` and the crawler is not being run for the first time, the default\nrequest queue will be purged." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1621, + "kind": 32768, + "kindString": "Parameter", + "name": "purge_request_queue", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + }, + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.run", + "target": 1618, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4131, + "module": "crawlers._basic._basic_crawler", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the underlying request manager in batches.\n", + "args": { + "requests": "A list of requests to add to the queue.", + "forefront": "If True, add requests to the forefront of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the underlying request manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 1623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of requests to add to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1624, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence[str | Request]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the forefront of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1625, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1626, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." 
+ } + ] + }, + "defaultValue": "timedelta(0)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1627, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1628, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1629, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.add_requests", + "target": 1622, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4132, + "module": "crawlers._basic._basic_crawler", + "name": "use_state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 843 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1631, + "kind": 4096, + 
"kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "use_state", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1632, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.use_state", + "target": 1630, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4133, + "module": "crawlers._basic._basic_crawler", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\none and then retrieves the data based on the provided parameters.\n", + "args": { + "dataset_id": "The ID of the `Dataset`.", + "dataset_name": "The name of the `Dataset` (global scope, named storage).", + "dataset_alias": "The alias of the `Dataset` (run scope, unnamed storage).", + "kwargs": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + }, + "returns": "The retrieved data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 854 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved data." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve data from a `Dataset`.\n\nThis helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified\none and then retrieves the data based on the provided parameters.\n" + } + ] + }, + "flags": {}, + "id": 1634, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the `Dataset`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1635, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `Dataset` (global scope, named storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1636, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the `Dataset` (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1637, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to be passed to the `Dataset.get_data()` method.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1638, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Unpack[GetDataKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.get_data", + "target": 1633, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4134, + "module": "crawlers._basic._basic_crawler", + "name": "export_data", + "parsedDocstring": { + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n", + "args": { + "path": "The destination file path. Must end with '.json' or '.csv'.", + "dataset_id": "The ID of the Dataset to export from.", + "dataset_name": "The name of the Dataset to export from (global scope, named storage).", + "dataset_alias": "The alias of the Dataset to export from (run scope, unnamed storage).", + "additional_kwargs": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 884 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export all items from a Dataset to a JSON or CSV file.\n\nThis method simplifies the process of exporting data collected during crawling. 
It automatically\ndetermines the export format based on the file extension (`.json` or `.csv`) and handles\nthe conversion of `Dataset` items to the appropriate format.\n" + } + ] + }, + "flags": {}, + "id": 1640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The destination file path. Must end with '.json' or '.csv'." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1641, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str | Path", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the Dataset to export from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1642, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_id", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the Dataset to export from (global scope, named storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1643, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_name", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the Dataset to export from (run scope, unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1644, + "kind": 32768, + "kindString": "Parameter", + "name": "dataset_alias", + "type": { + "name": "str | None", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1645, + "kind": 32768, + "kindString": "Parameter", + "name": "additional_kwargs", + "type": { + "name": "Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawler.export_data", + "target": 1639, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A web crawler that leverages the `Playwright` browser automation library.\n\nThe `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.\nOn top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more\nspecific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they\nopen. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let\nthe crawler create a new instance with the default settings.\n\nThis crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers\nto download web pages and extract data. For websites that do not require JavaScript, consider using one of the\nHTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. 
They use\nraw HTTP requests, which means they are much faster.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\ncrawler = PlaywrightCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: PlaywrightCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': await context.page.title(),\n 'response': (await context.response.text())[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + } + ] + }, + "decorations": [ + { + "args": "('Crawlers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1816, + 4131, + 4127, + 4134, + 4128, + 4133, + 4125, + 4126, + 4124, + 4129, + 1832, + 1829, + 4130, + 4123, + 4132 + ], + "title": "Methods" + }, + { + "children": [ + 4120, + 4121, + 4122 + ], + "title": "Properties" + } + ], + "id": 1815, + "module": "crawlers._playwright._playwright_crawler", + "name": "PlaywrightCrawler", + "parsedDocstring": { + "text": "A web crawler that leverages the `Playwright` browser automation library.\n\nThe `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.\nOn top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more\nspecific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they\nopen. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let\nthe crawler create a new instance with the default settings.\n\nThis crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers\nto download web pages and extract data. 
For websites that do not require JavaScript, consider using one of the\nHTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. They use\nraw HTTP requests, which means they are much faster.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\ncrawler = PlaywrightCrawler()\n\n# Define the default request handler, which will be called for every request.\n@crawler.router.default_handler\nasync def request_handler(context: PlaywrightCrawlingContext) -> None:\n context.log.info(f'Processing {context.request.url} ...')\n\n # Extract data from the page.\n data = {\n 'url': context.request.url,\n 'title': await context.page.title(),\n 'response': (await context.response.text())[:100],\n }\n\n # Push the extracted data to the default dataset.\n await context.push_data(data)\n\nawait crawler.run(['https://crawlee.dev/'])\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "BasicCrawler", + "target": "1555", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1836, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_pool", + "parsedDocstring": { + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 571 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserPool", + "target": "1167" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1837, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_type", + "parsedDocstring": { + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.\nThis option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 574 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "BrowserType", + "target": "1135" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. 
For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1838, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 580 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1839, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. 
For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 586 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Mapping", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1840, + "module": "crawlers._playwright._playwright_crawler", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 592 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional arguments for the `PlaywrightCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses.\nAll arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1838, + 1839, + 1836, + 1837, + 1840 + ], + "title": "Properties" + } + ], + "id": 1835, + "module": "crawlers._playwright._playwright_crawler", + "name": "_PlaywrightCrawlerAdditionalOptions", + "parsedDocstring": { + "text": "Additional arguments for the `PlaywrightCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses.\nAll arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 564 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightCrawlerOptions", + "target": "1841", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4010, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_pool", + "parsedDocstring": { + "text": "A `BrowserPool` instance to be used for launching the browsers and getting pages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 571 + } + ], + "type": { + "name": "NotRequired[BrowserPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_pool", + "target": 1836, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4011, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_type", + "parsedDocstring": { + "text": "The type of browser to launch:\n- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.\nThis option should not be used if `browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 574 + } + ], + "type": { + "name": "NotRequired[BrowserType]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_type", + "target": 1837, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4012, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_launch_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser launch method. These options are provided\ndirectly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\ndocumentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\nThis option should not be used if `browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 580 + } + ], + "type": { + "name": "NotRequired[Mapping[str, Any]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_launch_options", + "target": 1838, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4013, + "module": "crawlers._playwright._playwright_crawler", + "name": "browser_new_context_options", + "parsedDocstring": { + "text": "Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n`browser.new_context` method. For more details, refer to the Playwright documentation:\nhttps://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n`browser_pool` is provided." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 586 + } + ], + "type": { + "name": "NotRequired[Mapping[str, Any]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.browser_new_context_options", + "target": 1839, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4014, + "module": "crawlers._playwright._playwright_crawler", + "name": "headless", + "parsedDocstring": { + "text": "Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 592 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_PlaywrightCrawlerAdditionalOptions.headless", + "target": 1840, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable responsible for handling requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4216, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler", + "parsedDocstring": { + "text": "A callable responsible for handling requests." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.request_handler", + "target": 1552, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4217, + "module": "crawlers._basic._basic_crawler", + "name": "statistics", + "parsedDocstring": { + "text": "A custom `Statistics` instance, allowing the use of non-default configuration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "NotRequired[Statistics[TStatisticsState]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptionsGeneric.statistics", + "target": 1553, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4218, + "module": "crawlers._basic._basic_crawler", + "name": "configuration", + "parsedDocstring": { + "text": "The `Configuration` instance. Some of its properties are used as defaults for the crawler." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "NotRequired[Configuration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configuration", + "target": 1526, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager for managing events for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4219, + "module": "crawlers._basic._basic_crawler", + "name": "event_manager", + "parsedDocstring": { + "text": "The event manager for managing events for the crawler and all its components." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "NotRequired[EventManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.event_manager", + "target": 1527, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage client for managing storages for the crawler and all its components." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4220, + "module": "crawlers._basic._basic_crawler", + "name": "storage_client", + "parsedDocstring": { + "text": "The storage client for managing storages for the crawler and all its components." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 122 + } + ], + "type": { + "name": "NotRequired[StorageClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.storage_client", + "target": 1528, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager of requests that should be processed by the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4221, + "module": "crawlers._basic._basic_crawler", + "name": "request_manager", + "parsedDocstring": { + "text": "Manager of requests that should be processed by the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "NotRequired[RequestManager]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_manager", + "target": 1529, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4222, + "module": "crawlers._basic._basic_crawler", + "name": "session_pool", + "parsedDocstring": { + "text": "A custom `SessionPool` instance, allowing the use of non-default configuration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "NotRequired[SessionPool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.session_pool", + "target": 1530, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP proxy configuration used when making requests." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4223, + "module": "crawlers._basic._basic_crawler", + "name": "proxy_configuration", + "parsedDocstring": { + "text": "HTTP proxy configuration used when making requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 131 + } + ], + "type": { + "name": "NotRequired[ProxyConfiguration]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.proxy_configuration", + "target": 1531, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4224, + "module": "crawlers._basic._basic_crawler", + "name": "http_client", + "parsedDocstring": { + "text": "HTTP client used by `BasicCrawlingContext.send_request` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "NotRequired[HttpClient]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.http_client", + "target": 1532, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4225, + "module": "crawlers._basic._basic_crawler", + "name": "max_request_retries", + "parsedDocstring": { + "text": "Specifies the maximum number of retries allowed for a request if its processing fails.\nThis includes retries due to navigation errors or errors thrown from user-supplied functions\n(`request_handler`, `pre_navigation_hooks` etc.).\n\nThis limit does not apply to retries triggered by session rotation (see `max_session_rotations`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_request_retries", + "target": 1533, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4226, + "module": "crawlers._basic._basic_crawler", + "name": "max_requests_per_crawl", + "parsedDocstring": { + "text": "Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.\nSetting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\nDue to concurrency settings, the actual number of pages visited may slightly exceed this value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_requests_per_crawl", + "target": 1534, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4227, + "module": "crawlers._basic._basic_crawler", + "name": "max_session_rotations", + "parsedDocstring": { + "text": "Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\nor if the website blocks the request.\n\nThe session rotations are not counted towards the `max_request_retries` limit." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_session_rotations", + "target": 1535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4228, + "module": "crawlers._basic._basic_crawler", + "name": "max_crawl_depth", + "parsedDocstring": { + "text": "Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\nThe crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\nRequests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\nIf not set, crawling continues without depth restrictions." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "NotRequired[int | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.max_crawl_depth", + "target": 1536, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enable the use of a session pool for managing sessions during crawling." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4229, + "module": "crawlers._basic._basic_crawler", + "name": "use_session_pool", + "parsedDocstring": { + "text": "Enable the use of a session pool for managing sessions during crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.use_session_pool", + "target": 1537, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler attempts to bypass bot protections automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4230, + "module": "crawlers._basic._basic_crawler", + "name": "retry_on_blocked", + "parsedDocstring": { + "text": "If True, the crawler attempts to bypass bot protections automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.retry_on_blocked", + "target": 1538, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings to fine-tune concurrency levels." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4231, + "module": "crawlers._basic._basic_crawler", + "name": "concurrency_settings", + "parsedDocstring": { + "text": "Settings to fine-tune concurrency levels." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 169 + } + ], + "type": { + "name": "NotRequired[ConcurrencySettings]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.concurrency_settings", + "target": 1539, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum duration allowed for a single request handler to run." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4232, + "module": "crawlers._basic._basic_crawler", + "name": "request_handler_timeout", + "parsedDocstring": { + "text": "Maximum duration allowed for a single request handler to run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 172 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.request_handler_timeout", + "target": 1540, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler stops immediately when any request handler error occurs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4233, + "module": "crawlers._basic._basic_crawler", + "name": "abort_on_error", + "parsedDocstring": { + "text": "If True, the crawler stops immediately when any request handler error occurs." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.abort_on_error", + "target": 1541, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, the crawler will set up logging infrastructure automatically." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4234, + "module": "crawlers._basic._basic_crawler", + "name": "configure_logging", + "parsedDocstring": { + "text": "If True, the crawler will set up logging infrastructure automatically." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.configure_logging", + "target": 1542, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4235, + "module": "crawlers._basic._basic_crawler", + "name": "statistics_log_format", + "parsedDocstring": { + "text": "If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain\ntext log messages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "NotRequired[Literal['table', 'inline']]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.statistics_log_format", + "target": 1543, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag that can keep crawler running even when there are no requests in queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4236, + "module": "crawlers._basic._basic_crawler", + "name": "keep_alive", + "parsedDocstring": { + "text": "Flag that can keep crawler running even when there are no requests in queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 186 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.keep_alive", + "target": 1544, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4237, + "module": "crawlers._basic._basic_crawler", + "name": "additional_http_error_status_codes", + "parsedDocstring": { + "text": "Additional HTTP status codes to treat as errors, triggering automatic retries when encountered." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.additional_http_error_status_codes", + "target": 1545, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4238, + "module": "crawlers._basic._basic_crawler", + "name": "ignore_http_error_status_codes", + "parsedDocstring": { + "text": "HTTP status codes that are typically considered errors but should be treated as successful responses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "NotRequired[Iterable[int]]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.ignore_http_error_status_codes", + "target": 1546, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4239, + "module": "crawlers._basic._basic_crawler", + "name": "respect_robots_txt_file", + "parsedDocstring": { + "text": "If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\nand skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "NotRequired[bool]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.respect_robots_txt_file", + "target": 1547, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval for logging the crawler status messages." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4240, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_logging_interval", + "parsedDocstring": { + "text": "Interval for logging the crawler status messages." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "NotRequired[timedelta]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_logging_interval", + "target": 1548, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4241, + "module": "crawlers._basic._basic_crawler", + "name": "status_message_callback", + "parsedDocstring": { + "text": "Allows overriding the default status message. The default status message is provided in the parameters.\nReturning `None` suppresses the status message." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "NotRequired[ Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.status_message_callback", + "target": 1549, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4242, + "module": "crawlers._basic._basic_crawler", + "name": "id", + "parsedDocstring": { + "text": "Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between\nthem." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "NotRequired[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "_BasicCrawlerOptions.id", + "target": 1550, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 4233, + 4237, + 4012, + 4013, + 4010, + 4011, + 4231, + 4218, + 4234, + 4219, + 4014, + 4224, + 4242, + 4238, + 4236, + 4228, + 4225, + 4226, + 4227, + 4223, + 4216, + 4232, + 4221, + 4239, + 4230, + 4222, + 4217, + 4235, + 4241, + 4240, + 4220, + 4229 + ], + "title": "Properties" + } + ], + "id": 1841, + "module": "crawlers._playwright._playwright_crawler", + "name": "PlaywrightCrawlerOptions", + "parsedDocstring": { + "text": "Arguments for the `AbstractHttpCrawler` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 596 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "_PlaywrightCrawlerAdditionalOptions", + "target": "1835", + "type": "reference" + }, + { + "name": "BasicCrawlerOptions", + "target": "1554", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `EnqueueLinksFunction` implementation." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1843, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "enqueue_links", + "parsedDocstring": { + "text": "The Playwright `EnqueueLinksFunction` implementation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "EnqueueLinksFunction", + "type": "reference", + "target": "430" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `ExtractLinksFunction` implementation." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1844, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "extract_links", + "parsedDocstring": { + "text": "The Playwright `ExtractLinksFunction` implementation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "ExtractLinksFunction", + "type": "reference", + "target": "459" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering\nthe loading of additional content if present." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1845, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "infinite_scroll", + "parsedDocstring": { + "text": "A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering\nthe loading of additional content if present." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Callable", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4293, + "module": "crawlers._playwright._playwright_post_nav_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Response", + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPostNavCrawlingContext.response", + "target": 1847, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4294, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Page", + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.page", + "target": 1780, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4295, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "BlockRequestsFunction", + "type": "reference", + "target": "1793" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.block_requests", + "target": 1781, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4296, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "goto_options", + "parsedDocstring": { + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "GotoOptions", + "type": "reference", + "target": "1810" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.goto_options", + "target": 1782, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4297, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." 
+ } + ] + }, + "flags": {}, + "id": 1784, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 1783, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 1783, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4298, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4299, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4300, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4301, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4302, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4303, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4304, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4305, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4306, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4307, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4308, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4307, + 4308, + 4297 + ], + "title": "Methods" + }, + { + "children": [ + 4302, + 4295, + 1843, + 1844, + 4305, + 4296, + 1845, + 4306, + 4294, + 4300, + 4303, + 4298, + 4293, + 4301, + 4299, + 4304 + ], + "title": "Properties" + } + ], + "id": 1842, + "module": "crawlers._playwright._playwright_crawling_context", + "name": "PlaywrightCrawlingContext", + "parsedDocstring": { + "text": "The crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to key objects as well as utility functions for handling crawling tasks." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "PlaywrightPostNavCrawlingContext", + "target": "1846", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Response` object containing the response details for the current URL." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1847, + "module": "crawlers._playwright._playwright_post_nav_crawling_context", + "name": "response", + "parsedDocstring": { + "text": "The Playwright `Response` object containing the response details for the current URL." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Response", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Playwright `Page` object for the current page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4278, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "page", + "parsedDocstring": { + "text": "The Playwright `Page` object for the current page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Page", + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.page", + "target": 1780, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Blocks network requests matching specified URL patterns." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4279, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "block_requests", + "parsedDocstring": { + "text": "Blocks network requests matching specified URL patterns." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "BlockRequestsFunction", + "type": "reference", + "target": "1793" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.block_requests", + "target": 1781, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4280, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "goto_options", + "parsedDocstring": { + "text": "Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "GotoOptions", + "type": "reference", + "target": "1810" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.goto_options", + "target": 1782, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4281, + "module": "crawlers._playwright._playwright_pre_nav_crawling_context", + "name": "get_snapshot", + "parsedDocstring": { + "text": "Get snapshot of crawled page." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get snapshot of crawled page." + } + ] + }, + "flags": {}, + "id": 1784, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_snapshot", + "parameters": [], + "type": { + "name": "PageSnapshot", + "type": "reference", + "target": "495" + }, + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 1783, + "type": "reference" + } + } + ], + "overwrites": { + "name": "BasicCrawlingContext.get_snapshot", + "target": 514, + "type": "reference" + }, + "inheritedFrom": { + "name": "PlaywrightPreNavCrawlingContext.get_snapshot", + "target": 1783, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4282, + "module": "_types", + "name": "request", + "parsedDocstring": { + "text": "Request object for the current page being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 637 + } + ], + "type": { + "name": "Request", + "type": "reference", + "target": "150" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.request", + "target": 505, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Session object for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4283, + "module": "_types", + "name": "session", + "parsedDocstring": { + "text": "Session object for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 640 + } + ], + "type": { + "name": "Session | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.session", + "target": 506, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Proxy information for the current page being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4284, + "module": "_types", + "name": "proxy_info", + "parsedDocstring": { + "text": "Proxy information for the current page being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 643 + } + ], + "type": { + "name": "ProxyInfo | None", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.proxy_info", + "target": 507, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send request crawling context helper function." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 4285, + "module": "_types", + "name": "send_request", + "parsedDocstring": { + "text": "Send request crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 646 + } + ], + "type": { + "name": "SendRequestFunction", + "type": "reference", + "target": "488" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.send_request", + "target": 508, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4286, + "module": "_types", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 649 + } + ], + "type": { + "name": "AddRequestsFunction", + "type": "reference", + "target": "422" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.add_requests", + "target": 509, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4287, + "module": "_types", + "name": "push_data", + "parsedDocstring": { + "text": "Push data crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 652 + } + ], + "type": { + "name": "PushDataFunction", + "type": "reference", + "target": "480" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.push_data", + "target": 510, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Use state crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4288, + "module": "_types", + "name": "use_state", + "parsedDocstring": { + "text": "Use state crawling context helper function." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 655 + } + ], + "type": { + "name": "UseStateFunction", + "type": "reference", + "target": "500" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.use_state", + "target": 511, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get key-value store crawling context helper function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4289, + "module": "_types", + "name": "get_key_value_store", + "parsedDocstring": { + "text": "Get key-value store crawling context helper function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 658 + } + ], + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction", + "type": "reference", + "target": "474" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.get_key_value_store", + "target": 512, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Logger instance." + } + ] + }, + "flags": {}, + "groups": [], + "id": 4290, + "module": "_types", + "name": "log", + "parsedDocstring": { + "text": "Logger instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 661 + } + ], + "type": { + "name": "logging.Logger", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.log", + "target": 513, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4291, + "module": "_types", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash of the context. Each context is considered unique." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 668 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash of the context. Each context is considered unique." 
+ } + ] + }, + "flags": {}, + "id": 517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.__hash__", + "target": 516, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4292, + "module": "_types", + "name": "create_modified_copy", + "parsedDocstring": { + "text": "Create a modified copy of the crawling context with specified changes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 672 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a modified copy of the crawling context with specified changes." 
+ } + ] + }, + "flags": {}, + "id": 519, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_modified_copy", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 520, + "kind": 32768, + "kindString": "Parameter", + "name": "push_data", + "type": { + "name": "PushDataFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 521, + "kind": 32768, + "kindString": "Parameter", + "name": "add_requests", + "type": { + "name": "AddRequestsFunction | None", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 522, + "kind": 32768, + "kindString": "Parameter", + "name": "get_key_value_store", + "type": { + "name": "GetKeyValueStoreFromRequestHandlerFunction | None", + "type": "reference" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + }, + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "BasicCrawlingContext.create_modified_copy", + "target": 518, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The post navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` and `Response` objects, after the navigation to the URL is performed." 
+ } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4291, + 4292, + 4281 + ], + "title": "Methods" + }, + { + "children": [ + 4286, + 4279, + 4289, + 4280, + 4290, + 4278, + 4284, + 4287, + 4282, + 1847, + 4285, + 4283, + 4288 + ], + "title": "Properties" + } + ], + "id": 1846, + "module": "crawlers._playwright._playwright_post_nav_crawling_context", + "name": "PlaywrightPostNavCrawlingContext", + "parsedDocstring": { + "text": "The post navigation crawling context used by the `PlaywrightCrawler`.\n\nIt provides access to the `Page` and `Response` objects, after the navigation to the URL is performed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "PlaywrightPreNavCrawlingContext", + "target": "1779", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "PlaywrightCrawlingContext", + "target": "1842", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1849, + "module": "crawlers._types", + "name": "reason", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "No reason means 
no blocking." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1850, + "module": "crawlers._types", + "name": "__bool__", + "parsedDocstring": { + "text": "No reason means no blocking." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 12 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "No reason means no blocking." + } + ] + }, + "flags": {}, + "id": 1851, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1850 + ], + "title": "Methods" + }, + { + "children": [ + 1849 + ], + "title": "Properties" + } + ], + "id": 1848, + "module": "crawlers._types", + "name": "BlockedInfo", + "parsedDocstring": { + "text": "Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/crawlers/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1852, + "module": "events._local_event_manager", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1854, + "module": "events._local_event_manager", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n", + "args": { + "system_info_interval": "Interval at which `SystemInfo` events are emitted.", + "event_manager_options": "Additional options for the parent class." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nIn most cases, you should use the `from_config` constructor to create a new instance based on\nthe provided configuration.\n" + } + ] + }, + "flags": {}, + "id": 1855, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval at which `SystemInfo` events are emitted." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1856, + "kind": 32768, + "kindString": "Parameter", + "name": "system_info_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1905, + "module": "events._event_manager", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval between emitted `PersistState` events to maintain state persistence." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 1906, + "module": "events._event_manager", + "name": "close_timeout", + "parsedDocstring": { + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "EventManager.__init__", + "target": 1908, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__init__", + "target": 1908, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1858, + "module": "events._local_event_manager", + "name": "from_config", + "parsedDocstring": { + "text": "Initialize a new instance based on the provided 
`Configuration`.\n", + "args": { + "config": "The `Configuration` instance. Uses the global (default) one if not provided." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance based on the provided `Configuration`.\n" + } + ] + }, + "flags": {}, + "id": 1859, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_config", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The `Configuration` instance. Uses the global (default) one if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1860, + "kind": 32768, + "kindString": "Parameter", + "name": "config", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "LocalEventManager", + "type": "reference", + "target": "1853" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1861, + "module": "events._local_event_manager", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the local event manager upon entering the async context.\n\nIt starts emitting system info events at regular intervals." + } + ] + }, + "flags": {}, + "id": 1862, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "LocalEventManager", + "type": "reference", + "target": "1853" + }, + "overwrites": { + "name": "EventManager.__aenter__", + "target": 1913, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__aenter__", + "target": 1913, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1863, + "module": "events._local_event_manager", + "name": "__aexit__", + "parsedDocstring": { + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nIt stops emitting system info events and closes the event manager." 
+ } + ] + }, + "flags": {}, + "id": 1864, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1865, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1866, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1867, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "EventManager.__aexit__", + "target": 1915, + "type": "reference" + } + } + ], + "overwrites": { + "name": "EventManager.__aexit__", + "target": 1915, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4005, + "module": "events._event_manager", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.active", + "target": 1912, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4006, + "module": "events._event_manager", + "name": "on", + "parsedDocstring": { + "text": "Register an event listener for a specific event.\n", + "args": { + "event": "The event for which to listen to.", + "listener": "The function (sync or async) which is to be called when the event is emitted." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1922, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1923, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1935, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.PERSIST_STATE]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventPersistStateData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1938, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1939, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.SYSTEM_INFO]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventSystemInfoData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1941, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.MIGRATING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventMigratingData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1944, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.ABORTING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventAbortingData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1947, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1948, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.EXIT]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1949, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventExitData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1951, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.CRAWLER_STATUS]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1952, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[EventCrawlerStatusData]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1953, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[None]", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.on", + "target": 1920, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4007, + "module": "events._event_manager", + "name": "off", + "parsedDocstring": { + "text": "Remove a specific listener or all listeners for an event.\n", + "args": { + "event": "The Actor event for which to remove listeners.", + "listener": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "off", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Actor event for which to remove listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any] | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.off", + "target": 1924, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.off", + "target": 1924, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4008, + "module": "events._event_manager", + "name": "emit", + "parsedDocstring": { + "text": "Emit an event with the associated data to all registered listeners.\n", + "args": { + "event": "The event which will be emitted.", + "event_data": "The data which will be passed to the event listeners." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1929, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1930, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1931, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventData", + "type": "reference", + "target": "1899" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1956, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1957, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.PERSIST_STATE]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1958, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData", + "type": "reference", + "target": "1881" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1959, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1960, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.SYSTEM_INFO]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1961, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventSystemInfoData", + "type": "reference", + "target": "1884" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1962, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1963, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.MIGRATING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1964, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventMigratingData", + "type": "reference", + "target": "1888" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1965, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1966, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.ABORTING]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1967, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventAbortingData", + "type": "reference", + "target": "1891" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1968, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1969, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.EXIT]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1970, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventExitData", + "type": "reference", + "target": "1893" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1971, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1972, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal[Event.CRAWLER_STATUS]", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1973, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventCrawlerStatusData", + "type": "reference", + "target": "1895" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1974, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1975, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1976, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.emit", + "target": 1928, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 4009, + "module": "events._event_manager", + "name": "wait_for_all_listeners_to_complete", + "parsedDocstring": { + "text": "Wait for all currently executing event listeners to complete.\n", + "args": { + "timeout": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "flags": {}, + "id": 1933, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_listeners_to_complete", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "EventManager.wait_for_all_listeners_to_complete", + "target": 1932, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "EventManager.wait_for_all_listeners_to_complete", + "target": 1932, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Event manager for local environments.\n\nIt extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`\nis intended to be used in local environments, where the system metrics are required managing the `Snapshotter`\nand `AutoscaledPool`." 
+ } + ] + }, + "decorations": [ + { + "args": "('Event managers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1861, + 1863, + 1854, + 4008, + 1858, + 4007, + 4006, + 4009 + ], + "title": "Methods" + }, + { + "children": [ + 4005 + ], + "title": "Properties" + } + ], + "id": 1853, + "module": "events._local_event_manager", + "name": "LocalEventManager", + "parsedDocstring": { + "text": "Event manager for local environments.\n\nIt extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`\nis intended to be used in local environments, where the system metrics are required managing the `Snapshotter`\nand `AutoscaledPool`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_local_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "EventManager", + "target": "1907", + "type": "reference" + } + ] + }, + { + "kind": 8, + "kindString": "Enumeration", + "children": [ + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1869, + "module": "events._types", + "name": "PERSIST_STATE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "type": "literal", + "value": "'persistState'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1870, + "module": "events._types", + "name": "SYSTEM_INFO", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "type": "literal", + "value": "'systemInfo'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1871, + "module": "events._types", + "name": "MIGRATING", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "type": "literal", + "value": "'migrating'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1872, + "module": "events._types", + "name": "ABORTING", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "type": "literal", + "value": "'aborting'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1873, + "module": "events._types", + "name": "EXIT", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "type": "literal", + "value": "'exit'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1874, + "module": "events._types", + "name": 
"SESSION_RETIRED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "type": "literal", + "value": "'sessionRetired'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1875, + "module": "events._types", + "name": "BROWSER_LAUNCHED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "type": "literal", + "value": "'browserLaunched'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1876, + "module": "events._types", + "name": "BROWSER_RETIRED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "type": "literal", + "value": "'browserRetired'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1877, + "module": "events._types", + "name": "BROWSER_CLOSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "type": "literal", + "value": "'browserClosed'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1878, + "module": "events._types", + "name": "PAGE_CREATED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "type": "literal", + "value": "'pageCreated'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1879, + "module": "events._types", + "name": "PAGE_CLOSED", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "type": "literal", + "value": "'pageClosed'" + } + }, + { + "kind": 16, + "kindString": "Enumeration Member", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1880, + "module": "events._types", + "name": "CRAWLER_STATUS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "type": "literal", + "value": "'crawlerStatus'" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Names of all possible events that can be emitted using an `EventManager`." 
+ } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1872, + 1877, + 1875, + 1876, + 1880, + 1873, + 1871, + 1879, + 1878, + 1869, + 1874, + 1870 + ], + "title": "Enumeration members" + } + ], + "id": 1868, + "module": "events._types", + "name": "Event", + "parsedDocstring": { + "text": "Names of all possible events that can be emitted using an `EventManager`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1882, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1883, + "module": "events._types", + "name": "is_migrating", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the persist state event." 
+ } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1883, + 1882 + ], + "title": "Properties" + } + ], + "id": 1881, + "module": "events._types", + "name": "EventPersistStateData", + "parsedDocstring": { + "text": "Data for the persist state event." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1885, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1886, + "module": "events._types", + "name": "cpu_info", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "CpuInfo", + "type": "reference", + "target": "887" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1887, + "module": "events._types", + "name": "memory_info", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "MemoryUsageInfo", + "type": "reference", + "target": "890" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the system info event." + } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1886, + 1887, + 1885 + ], + "title": "Properties" + } + ], + "id": 1884, + "module": "events._types", + "name": "EventSystemInfoData", + "parsedDocstring": { + "text": "Data for the system info event." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1889, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1890, + "module": "events._types", + "name": "time_remaining", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": 
"Annotated[timedelta_secs | None, Field(alias='timeRemainingSecs')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_secs", + "target": "791" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the migrating event." + } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1889, + 1890 + ], + "title": "Properties" + } + ], + "id": 1888, + "module": "events._types", + "name": "EventMigratingData", + "parsedDocstring": { + "text": "Data for the migrating event." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1892, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the aborting event." + } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1892 + ], + "title": "Properties" + } + ], + "id": 1891, + "module": "events._types", + "name": "EventAbortingData", + "parsedDocstring": { + "text": "Data for the aborting event." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1894, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the exit event." + } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1894 + ], + "title": "Properties" + } + ], + "id": 1893, + "module": "events._types", + "name": "EventExitData", + "parsedDocstring": { + "text": "Data for the exit event." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1896, + "module": "events._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A message describing the current status of the crawler." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1897, + "module": "events._types", + "name": "message", + "parsedDocstring": { + "text": "A message describing the current status of the crawler." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the crawler that emitted the event." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1898, + "module": "events._types", + "name": "crawler_id", + "parsedDocstring": { + "text": "The ID of the crawler that emitted the event." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Data for the crawler status event." + } + ] + }, + "decorations": [ + { + "args": "('Event data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1898, + 1897, + 1896 + ], + "title": "Properties" + } + ], + "id": 1895, + "module": "events._types", + "name": "EventCrawlerStatusData", + "parsedDocstring": { + "text": "Data for the crawler status event." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A helper type for all possible event payloads" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1899, + "module": "events._types", + "name": "EventData", + "parsedDocstring": { + "text": "A helper type for all possible event payloads" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1900, + "module": "events._types", + "name": "WrappedListener", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": 
{ + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1901, + "module": "events._types", + "name": "TEvent", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An event listener function - it can be both sync and async and may accept zero or one argument." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1902, + "module": "events._types", + "name": "EventListener", + "parsedDocstring": { + "text": "An event listener function - it can be both sync and async and may accept zero or one argument." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1903, + "module": "events._event_manager", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "flags": {}, + "groups": [], + "id": 1905, + "module": "events._event_manager", + "name": "persist_state_interval", + "parsedDocstring": { + "text": "Interval between emitted `PersistState` events to maintain state persistence." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "timedelta" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 1906, + "module": "events._event_manager", + "name": "close_timeout", + "parsedDocstring": { + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Arguments for the `EventManager` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 1906, + 1905 + ], + "title": "Properties" + } + ], + "id": 1904, + "module": "events._event_manager", + "name": "EventManagerOptions", + "parsedDocstring": { + "text": "Arguments for the `EventManager` constructor.\n\nIt is intended for typing forwarded `__init__` arguments in the subclasses." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1908, + "module": "events._event_manager", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_state_interval": "Interval between emitted `PersistState` events to maintain state persistence.", + "close_timeout": "Optional timeout for canceling pending event listeners if they exceed this duration." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 1909, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Interval between emitted `PersistState` events to maintain state persistence." + } + ] + }, + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1910, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional timeout for canceling pending event listeners if they exceed this duration." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1911, + "kind": 32768, + "kindString": "Parameter", + "name": "close_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 1912, + "module": "events._event_manager", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the event manager upon entering the async context.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1913, + "module": "events._event_manager", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the event manager upon entering the async context.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the event manager upon entering the async context.\n" + } + ] + }, + "flags": {}, + "id": 1914, + "kind": 4096, 
+ "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "EventManager", + "type": "reference", + "target": "1907" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1915, + "module": "events._event_manager", + "name": "__aexit__", + "parsedDocstring": { + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 117 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the local event manager upon exiting the async context.\n\nThis will stop listening for the events, and it will wait for all the event listeners to finish.\n" + } + ] + }, + "flags": {}, + "id": 1916, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1917, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": 
false + }, + "id": 1918, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1919, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1920, + "module": "events._event_manager", + "name": "on", + "parsedDocstring": { + "text": "Register an event listener for a specific event.\n", + "args": { + "event": "The event for which to listen to.", + "listener": "The function (sync or async) which is to be called when the event is emitted." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1921, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1922, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1923, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1935, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1936, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.PERSIST_STATE" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1937, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventPersistStateData", + "target": "1881" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1938, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1939, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.SYSTEM_INFO" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1940, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventSystemInfoData", + "target": "1884" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1941, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1942, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.MIGRATING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1943, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventMigratingData", + "target": "1888" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1944, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1945, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.ABORTING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1946, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventAbortingData", + "target": "1891" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1947, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1948, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.EXIT" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1949, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventExitData", + "target": "1893" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1950, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1951, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.CRAWLER_STATUS" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1952, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "EventCrawlerStatusData", + "target": "1895" + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Register an event listener for a specific event.\n" + } + ] + }, + "flags": {}, + "id": 1953, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "on", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event for which to listen to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1954, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The function (sync or async) which is to be called when the event is emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1955, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": null + } + ], + "target": "1902" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1924, + "module": "events._event_manager", + "name": "off", + "parsedDocstring": { + "text": "Remove a specific listener or all listeners for an event.\n", + "args": { + "event": "The Actor event for which to remove listeners.", + "listener": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 207 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a specific listener or all listeners for an event.\n" + } + ] + }, + "flags": {}, + "id": 1925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "off", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Actor event for which to remove listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1926, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The listener which is supposed to be removed. If not passed, all listeners of this event\nare removed." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1927, + "kind": 32768, + "kindString": "Parameter", + "name": "listener", + "type": { + "name": "EventListener[Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventListener", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ], + "target": "1902" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1928, + "module": "events._event_manager", + "name": "emit", + "parsedDocstring": { + "text": "Emit an event with the associated data to all registered listeners.\n", + "args": { + "event": "The event which will be emitted.", + "event_data": "The data which will be passed to the event listeners." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1929, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1930, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1931, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventData", + "type": "reference", + "target": "1899" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1956, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1957, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.PERSIST_STATE" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1958, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventPersistStateData", + "type": "reference", + "target": "1881" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1959, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1960, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.SYSTEM_INFO" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1961, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventSystemInfoData", + "type": "reference", + "target": "1884" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1962, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1963, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.MIGRATING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1964, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventMigratingData", + "type": "reference", + "target": "1888" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1965, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1966, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.ABORTING" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1967, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventAbortingData", + "type": "reference", + "target": "1891" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1968, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1969, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.EXIT" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1970, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventExitData", + "type": "reference", + "target": "1893" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1971, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1972, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Event.CRAWLER_STATUS" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1973, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "EventCrawlerStatusData", + "type": "reference", + "target": "1895" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Emit an event with the associated data to all registered listeners.\n" + } + ] + }, + "flags": {}, + "id": 1974, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "emit", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event which will be emitted." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1975, + "kind": 32768, + "kindString": "Parameter", + "name": "event", + "type": { + "name": "Event", + "type": "reference", + "target": "1868" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data which will be passed to the event listeners." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 1976, + "kind": 32768, + "kindString": "Parameter", + "name": "event_data", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 1932, + "module": "events._event_manager", + "name": "wait_for_all_listeners_to_complete", + "parsedDocstring": { + "text": "Wait for all currently executing event listeners to complete.\n", + "args": { + "timeout": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Wait for all currently executing event listeners to complete.\n" + } + ] + }, + "flags": {}, + "id": 1933, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "wait_for_all_listeners_to_complete", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for the event listeners to finish. If they do not complete within\nthe specified timeout, they will be canceled." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1934, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manage events and their listeners, enabling registration, emission, and execution control.\n\nIt allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.\nBuilt on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all\nlisteners to complete and emitting `PersistState` events at regular intervals." 
+ } + ] + }, + "decorations": [ + { + "args": "('Event managers')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1913, + 1915, + 1908, + 1928, + 1924, + 1920, + 1932 + ], + "title": "Methods" + }, + { + "children": [ + 1912 + ], + "title": "Properties" + } + ], + "id": 1907, + "module": "events._event_manager", + "name": "EventManager", + "parsedDocstring": { + "text": "Manage events and their listeners, enabling registration, emission, and execution control.\n\nIt allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.\nBuilt on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all\nlisteners to complete and emitting `PersistState` events at regular intervals." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/events/_event_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "LocalEventManager", + "target": "1853", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1977, + "module": "fingerprint_suite._consts", + "name": "COMMON_ACCEPT_LANGUAGE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 3 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 1978, + "module": "fingerprint_suite._consts", + "name": "BROWSER_TYPE_HEADER_KEYWORD", + 
"parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_consts.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 1980, + "module": "fingerprint_suite._fingerprint_generator", + "name": "generate", + "parsedDocstring": { + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_fingerprint_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." 
+ } + ] + }, + "flags": {}, + "id": 1981, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [], + "type": { + "name": "Fingerprint", + "type": "reference" + }, + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1980, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for creating browser fingerprints that mimic browser fingerprints of real users." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1980 + ], + "title": "Methods" + } + ], + "id": 1979, + "module": "fingerprint_suite._fingerprint_generator", + "name": "FingerprintGenerator", + "parsedDocstring": { + "text": "A class for creating browser fingerprints that mimic browser fingerprints of real users." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_fingerprint_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "BrowserforgeFingerprintGenerator", + "target": "2039", + "type": "reference" + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1982, + "module": "fingerprint_suite._header_generator", + "name": "fingerprint_browser_type_from_playwright_browser_type", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1983, + "kind": 4096, + "kindString": "Call 
signature", + "modifiers": [], + "name": "fingerprint_browser_type_from_playwright_browser_type", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 1984, + "kind": 32768, + "kindString": "Parameter", + "name": "playwright_browser_type", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "chromium" + }, + { + "type": "literal", + "value": "firefox" + }, + { + "type": "literal", + "value": "webkit" + }, + { + "type": "literal", + "value": "chrome" + } + ] + } + } + ], + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "2005" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1986, + "module": "fingerprint_suite._header_generator", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 1987, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1988, + "module": "fingerprint_suite._header_generator", + "name": "get_specific_headers", + "parsedDocstring": { + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return subset of headers based on the selected `header_names`.\n\nIf no `header_names` are specified, full unfiltered headers are returned." + } + ] + }, + "flags": {}, + "id": 1989, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_specific_headers", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1990, + "kind": 32768, + "kindString": "Parameter", + "name": "header_names", + "type": { + "name": "set[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'chrome'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 1991, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "2005" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get common HTTP headers (\"Accept\", 
\"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1992, + "module": "fingerprint_suite._header_generator", + "name": "get_common_headers", + "parsedDocstring": { + "text": "Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\nWe do not modify the \"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\nby the HTTP client or browser." + } + ] + }, + "flags": {}, + "id": 1993, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_common_headers", + "parameters": [], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a random User-Agent header." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1994, + "module": "fingerprint_suite._header_generator", + "name": "get_random_user_agent_header", + "parsedDocstring": { + "text": "Get a random User-Agent header." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a random User-Agent header." + } + ] + }, + "flags": {}, + "id": 1995, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_random_user_agent_header", + "parameters": [], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the User-Agent header based on the browser type." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1996, + "module": "fingerprint_suite._header_generator", + "name": "get_user_agent_header", + "parsedDocstring": { + "text": "Get the User-Agent header based on the browser type." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the User-Agent header based on the browser type." 
+ } + ] + }, + "flags": {}, + "id": 1997, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_user_agent_header", + "parameters": [ + { + "defaultValue": "'chrome'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 1998, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "2005" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the sec-ch-ua headers based on the browser type." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 1999, + "module": "fingerprint_suite._header_generator", + "name": "get_sec_ch_ua_headers", + "parsedDocstring": { + "text": "Get the sec-ch-ua headers based on the browser type." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the sec-ch-ua headers based on the browser type." 
+ } + ] + }, + "flags": {}, + "id": 2000, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_sec_ch_ua_headers", + "parameters": [ + { + "defaultValue": "'chrome'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2001, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "2005" + } + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate realistic looking or browser-like HTTP headers." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 1986, + 1992, + 1994, + 1999, + 1988, + 1996 + ], + "title": "Methods" + } + ], + "id": 1985, + "module": "fingerprint_suite._header_generator", + "name": "HeaderGenerator", + "parsedDocstring": { + "text": "Generate realistic looking or browser-like HTTP headers." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_header_generator.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2002, + "module": "fingerprint_suite._types", + "name": "SupportedOperatingSystems", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 7 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2003, + "module": "fingerprint_suite._types", + "name": "SupportedDevices", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 8 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2004, + "module": "fingerprint_suite._types", + "name": "SupportedHttpVersion", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 9 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + 
{ + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2005, + "module": "fingerprint_suite._types", + "name": "SupportedBrowserType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 10 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines the screen constrains for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2007, + "module": "fingerprint_suite._types", + "name": "model_config", + "parsedDocstring": { + "text": "Defines the screen constrains for the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Minimal screen width constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2008, + "module": "fingerprint_suite._types", + "name": "min_width", + "parsedDocstring": { + "text": "Minimal screen width constraint for the fingerprint generator." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='minWidth')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximal screen width constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2009, + "module": "fingerprint_suite._types", + "name": "max_width", + "parsedDocstring": { + "text": "Maximal screen width constraint for the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='maxWidth')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Minimal screen height constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2010, + "module": "fingerprint_suite._types", + "name": "min_height", + "parsedDocstring": { + "text": "Minimal screen height constraint for the fingerprint generator." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='minHeight')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximal screen height constraint for the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2011, + "module": "fingerprint_suite._types", + "name": "max_height", + "parsedDocstring": { + "text": "Maximal screen height constraint for the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Annotated[float | None, Field(alias='maxHeight')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2011, + 2009, + 2010, + 2008, + 2007 + ], + "title": "Properties" + } + ], + "id": 2006, + "module": "fingerprint_suite._types", + "name": "ScreenOptions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": 
{}, + "groups": [], + "id": 2013, + "module": "fingerprint_suite._types", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of BrowserSpecifications to generate the headers for." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2014, + "module": "fingerprint_suite._types", + "name": "browsers", + "parsedDocstring": { + "text": "List of BrowserSpecifications to generate the headers for." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "list[SupportedBrowserType] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedBrowserType", + "target": "2005" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of operating systems to generate the headers for." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2015, + "module": "fingerprint_suite._types", + "name": "operating_systems", + "parsedDocstring": { + "text": "List of operating systems to generate the headers for." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedOperatingSystems", + "target": "2002" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of devices to generate the headers for." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2016, + "module": "fingerprint_suite._types", + "name": "devices", + "parsedDocstring": { + "text": "List of devices to generate the headers for." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "list[SupportedDevices] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "SupportedDevices", + "target": "2003" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of at most 10 languages to include in the [Accept-Language]\n(https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header\nin the language format accepted by that header, for example `en`, `en-US` or `de`." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2017, + "module": "fingerprint_suite._types", + "name": "locales", + "parsedDocstring": { + "text": "List of at most 10 languages to include in the [Accept-Language]\n(https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header\nin the language format accepted by that header, for example `en`, `en-US` or `de`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP version to be used for header generation (the headers differ depending on the version)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2018, + "module": "fingerprint_suite._types", + "name": "http_version", + "parsedDocstring": { + "text": "HTTP version to be used for header generation (the headers differ depending on the version)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SupportedHttpVersion", + "target": "2004" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If true, the generator will throw an error if it cannot generate headers based on the input." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2019, + "module": "fingerprint_suite._types", + "name": "strict", + "parsedDocstring": { + "text": "If true, the generator will throw an error if it cannot generate headers based on the input." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Collection of header related attributes that can be used by the fingerprint generator." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2014, + 2016, + 2018, + 2017, + 2013, + 2015, + 2019 + ], + "title": "Properties" + } + ], + "id": 2012, + "module": "fingerprint_suite._types", + "name": "HeaderGeneratorOptions", + "parsedDocstring": { + "text": "Collection of header related attributes that can be used by the fingerprint generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. 
It tries\nto generate headers several times until they match the requirements.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2021, + "module": "fingerprint_suite._browserforge_adapter", + "name": "generate", + "parsedDocstring": { + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. It tries\nto generate headers several times until they match the requirements.\n", + "returns": "A generated headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A generated headers." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Generate HTTP headers based on the specified parameters.\n\nFor detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\nThis patched version of the method adds additional quality checks on the output of the original method. 
It tries\nto generate headers several times until they match the requirements.\n" + } + ] + }, + "flags": {}, + "id": 2022, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2023, + "kind": 32768, + "kindString": "Parameter", + "name": "browser", + "type": { + "name": "Iterable[str | Browser] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Browser" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2024, + "kind": 32768, + "kindString": "Parameter", + "name": "os", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2025, + "kind": 32768, + "kindString": "Parameter", + "name": "device", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2026, + "kind": 32768, + "kindString": "Parameter", + "name": "locale", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2027, + "kind": 32768, + 
"kindString": "Parameter", + "name": "http_version", + "type": { + "name": "Literal[1, 2] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": 1 + }, + { + "type": "literal", + "value": 2 + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2028, + "kind": 32768, + "kindString": "Parameter", + "name": "user_agent", + "type": { + "name": "ListOrString | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ListOrString" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2029, + "kind": 32768, + "kindString": "Parameter", + "name": "strict", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2030, + "kind": 32768, + "kindString": "Parameter", + "name": "request_dependent_headers", + "type": { + "name": "dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2021 + ], + "title": "Methods" + } + ], + "id": 2020, + "module": "fingerprint_suite._browserforge_adapter", + "name": "PatchedHeaderGenerator", + "parsedDocstring": { + "text": "Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2032, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "screen": "Screen constraints for the generated fingerprint.", + "strict": "Whether to raise an exception if the constraints are too strict.", + "mock_webrtc": "Whether to mock WebRTC when injecting the fingerprint.", + "slim": "Disables performance-heavy evasions when injecting the fingerprint.", + "**header_kwargs": "Header generation options for `HeaderGenerator`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2033, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Screen constraints for the generated fingerprint." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2034, + "kind": 32768, + "kindString": "Parameter", + "name": "screen", + "type": { + "name": "Screen | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Screen" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to raise an exception if the constraints are too strict." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2035, + "kind": 32768, + "kindString": "Parameter", + "name": "strict", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to mock WebRTC when injecting the fingerprint." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2036, + "kind": 32768, + "kindString": "Parameter", + "name": "mock_webrtc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables performance-heavy evasions when injecting the fingerprint." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2037, + "kind": 32768, + "kindString": "Parameter", + "name": "slim", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2038, + "kind": 32768, + "kindString": "Parameter", + "name": "header_kwargs", + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2032 + ], + "title": "Methods" + } + ], + "id": 2031, + "module": "fingerprint_suite._browserforge_adapter", + "name": "PatchedFingerprintGenerator", + "parsedDocstring": { + "text": "Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. 
Use explicit values if it matters for your use case.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2040, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. Use explicit values if it matters for your use case.\n", + "args": { + "header_options": "Collection of header related attributes that can be used by the fingerprint generator.", + "screen_options": "Defines the screen constrains for the fingerprint generator.", + "mock_web_rtc": "Whether to mock WebRTC when injecting the fingerprint.", + "slim": "Disables performance-heavy evasions when injecting the fingerprint." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 187 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nAll generator options are optional. If any value is not specified, then `None` is set in the options.\nDefault values for options set to `None` are implementation detail of used fingerprint generator.\nSpecific default values should not be relied upon. Use explicit values if it matters for your use case.\n" + } + ] + }, + "flags": {}, + "id": 2041, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Collection of header related attributes that can be used by the fingerprint generator." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2042, + "kind": 32768, + "kindString": "Parameter", + "name": "header_options", + "type": { + "name": "HeaderGeneratorOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGeneratorOptions", + "target": "2012" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Defines the screen constrains for the fingerprint generator." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2043, + "kind": 32768, + "kindString": "Parameter", + "name": "screen_options", + "type": { + "name": "ScreenOptions | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ScreenOptions", + "target": "2006" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to mock WebRTC when injecting the fingerprint." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2044, + "kind": 32768, + "kindString": "Parameter", + "name": "mock_web_rtc", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Disables performance-heavy evasions when injecting the fingerprint." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2045, + "kind": 32768, + "kindString": "Parameter", + "name": "slim", + "type": { + "name": "bool | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2046, + "module": "fingerprint_suite._fingerprint_generator", + "name": "generate", + "parsedDocstring": { + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 227 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate browser fingerprints.\n\nThis is experimental feature.\nReturn type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\nit will change to custom `Fingerprint` class defined in this repo later." 
+ } + ] + }, + "flags": {}, + "id": 1981, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [], + "type": { + "name": "Fingerprint", + "type": "reference" + }, + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1980, + "type": "reference" + } + } + ], + "overwrites": { + "name": "FingerprintGenerator.generate", + "target": 1980, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.\n\n`browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge" + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2040, + 2046 + ], + "title": "Methods" + } + ], + "id": 2039, + "module": "fingerprint_suite._browserforge_adapter", + "name": "BrowserforgeFingerprintGenerator", + "parsedDocstring": { + "text": "`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.\n\n`browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 181 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "FingerprintGenerator", + "target": "1979", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2049, + "module": "fingerprint_suite._browserforge_adapter", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 245 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2050, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate headers." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2051, + "module": "fingerprint_suite._browserforge_adapter", + "name": "generate", + "parsedDocstring": { + "text": "Generate headers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 248 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Generate headers." + } + ] + }, + "flags": {}, + "id": 2052, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "generate", + "parameters": [ + { + "defaultValue": "'chrome'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2053, + "kind": 32768, + "kindString": "Parameter", + "name": "browser_type", + "type": { + "name": "SupportedBrowserType", + "type": "reference", + "target": "2005" + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "`HeaderGenerator` adapter for fingerprint generator from `browserforge`." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2049, + 2051 + ], + "title": "Methods" + } + ], + "id": 2048, + "module": "fingerprint_suite._browserforge_adapter", + "name": "BrowserforgeHeaderGenerator", + "parsedDocstring": { + "text": "`HeaderGenerator` adapter for fingerprint generator from `browserforge`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get header network that contains possible header values." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2054, + "module": "fingerprint_suite._browserforge_adapter", + "name": "get_available_header_network", + "parsedDocstring": { + "text": "Get header network that contains possible header values." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 253 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get header network that contains possible header values." + } + ] + }, + "flags": {}, + "id": 2055, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_available_header_network", + "parameters": [], + "type": { + "name": "dict", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get set of possible header values from available header network." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2056, + "module": "fingerprint_suite._browserforge_adapter", + "name": "get_available_header_values", + "parsedDocstring": { + "text": "Get set of possible header values from available header network." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get set of possible header values from available header network." + } + ] + }, + "flags": {}, + "id": 2057, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_available_header_values", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2058, + "kind": 32768, + "kindString": "Parameter", + "name": "header_network", + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2059, + "kind": 32768, + "kindString": "Parameter", + "name": "node_name", + "type": { + "name": "str | set[str]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + ] + } + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP version used in the response." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2061, + "module": "http_clients._base", + "name": "http_version", + "parsedDocstring": { + "text": "The HTTP version used in the response." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP status code received from the server." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2062, + "module": "http_clients._base", + "name": "status_code", + "parsedDocstring": { + "text": "The HTTP status code received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP headers received in the response." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2063, + "module": "http_clients._base", + "name": "headers", + "parsedDocstring": { + "text": "The HTTP headers received in the response." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2064, + "module": "http_clients._base", + "name": "read", + "parsedDocstring": { + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read the entire content of the response body.\n\nThis method loads the complete response body into memory at once. 
It should be used\nfor responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n" + } + ] + }, + "flags": {}, + "id": 2065, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2066, + "module": "http_clients._base", + "name": "read_stream", + "parsedDocstring": { + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the content of the response body in chunks.\n\nThis method should be used for responses received from the `stream` method to process\nlarge response bodies without loading them entirely into memory. 
It allows for efficient\nprocessing of potentially large data by yielding chunks sequentially.\n" + } + ] + }, + "flags": {}, + "id": 2067, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Define the interface that any HTTP response object must implement." + } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2064, + 2066 + ], + "title": "Methods" + }, + { + "children": [ + 2063, + 2061, + 2062 + ], + "title": "Properties" + } + ], + "id": 2060, + "module": "http_clients._base", + "name": "HttpResponse", + "parsedDocstring": { + "text": "Define the interface that any HTTP response object must implement." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2069, + "module": "http_clients._base", + "name": "http_response", + "parsedDocstring": { + "text": "The HTTP response received from the server." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Result of an HTTP-only crawl.\n\nMainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,\n`ParselCrawlingContext`, ...)." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Crawling contexts')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2069 + ], + "title": "Properties" + } + ], + "id": 2068, + "module": "http_clients._base", + "name": "HttpCrawlingResult", + "parsedDocstring": { + "text": "Result of an HTTP-only crawl.\n\nMainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,\n`ParselCrawlingContext`, ...)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "HttpCrawlingContext", + "target": "1245", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2071, + "module": "http_clients._base", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2072, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2073, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2074, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2075, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2077, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2078, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "2068" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2082, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "Maximum time allowed to process the 
request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 2083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2084, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2085, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2086, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2087, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2088, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2091, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The 
data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 2092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2093, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2094, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2095, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2097, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2099, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2100, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 193 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." 
+ } + ] + }, + "flags": {}, + "id": 2101, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2102, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2103, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2104, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": 
"02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2107, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2108, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses)." 
+ } + ] + }, + "decorations": [ + { + "args": "('HTTP clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2102, + 2104, + 2071, + 2100, + 2075, + 2082, + 2091 + ], + "title": "Methods" + }, + { + "children": [ + 2074 + ], + "title": "Properties" + } + ], + "id": 2070, + "module": "http_clients._base", + "name": "HttpClient", + "parsedDocstring": { + "text": "An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 75 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "PlaywrightHttpClient", + "target": "1749", + "type": "reference" + }, + { + "name": "CurlImpersonateHttpClient", + "target": "2132", + "type": "reference" + }, + { + "name": "ImpitHttpClient", + "target": "2179", + "type": "reference" + }, + { + "name": "HttpxHttpClient", + "target": "2230", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2110, + "module": "http_clients._curl_impersonate", + "name": "get_cookies_for_curl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2111, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_for_curl", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, 
+ "id": 2112, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "CurlRequest", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CurlMorsel" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2113, + "module": "http_clients._curl_impersonate", + "name": "update_cookies_from_curl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2114, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "update_cookies_from_curl", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2115, + "kind": 32768, + "kindString": "Parameter", + "name": "morsels", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CurlMorsel" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2110, + 2113 + ], + "title": "Methods" + } + ], + "id": 2109, + "module": "http_clients._curl_impersonate", + "name": "_EmptyCookies", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2117, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2118, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2119, + "kind": 32768, + "kindString": "Parameter", + "name": "args", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2120, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2117 + ], + "title": "Methods" + } + ], + "id": 2116, + "module": "http_clients._curl_impersonate", + "name": "_AsyncSession", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": 
"Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2122, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2123, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2124, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Response", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2125, + "module": "http_clients._curl_impersonate", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2126, + "module": "http_clients._curl_impersonate", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2127, + "module": "http_clients._curl_impersonate", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2128, + "module": "http_clients._curl_impersonate", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2129, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2130, + "module": "http_clients._curl_impersonate", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2131, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncGenerator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2122, + 2128, + 2130 + ], + "title": "Methods" + }, + { + "children": [ + 2127, + 2125, + 2126 + ], + "title": "Properties" + } + ], + "id": 2121, + "module": "http_clients._curl_impersonate", + "name": "_CurlImpersonateResponse", + "parsedDocstring": { + "text": "Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2133, + "module": "http_clients._curl_impersonate", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session.", + "async_session_kwargs": "Additional keyword arguments for `curl_cffi.requests.AsyncSession`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2134, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2135, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional keyword arguments for `curl_cffi.requests.AsyncSession`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2136, + "kind": 32768, + "kindString": "Parameter", + "name": "async_session_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2137, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2077, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2078, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "2068" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2144, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": 
"The information about the proxy to be used.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 189 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 2083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2084, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2085, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2086, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2087, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2088, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2153, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": 
"The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 230 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 2092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2093, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2094, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2095, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2097, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2099, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed 
and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2162, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 360 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "flags": {}, + "id": 2101, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4018, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 2074, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4019, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2103, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and 
clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4020, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2107, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2108, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the `curl-cffi` library.\n\nThis client uses the `curl-cffi` library 
to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n\nhttp_client = CurlImpersonateHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + } + ] + }, + "decorations": [ + { + "args": "('HTTP clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4019, + 4020, + 2133, + 2162, + 2137, + 2144, + 2153 + ], + "title": "Methods" + }, + { + "children": [ + 4018 + ], + "title": "Properties" + } + ], + "id": 2132, + "module": "http_clients._curl_impersonate", + "name": "CurlImpersonateHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the `curl-cffi` library.\n\nThis client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n\nhttp_client = CurlImpersonateHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_curl_impersonate.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 109 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "2070", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2164, + 
"module": "http_clients._impit", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2166, + "module": "http_clients._impit", + "name": "client", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "AsyncClient", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2167, + "module": "http_clients._impit", + "name": "cookie_jar", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "CookieJar | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "CookieJar" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Type definition for client cache entries." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2166, + 2167 + ], + "title": "Properties" + } + ], + "id": 2165, + "module": "http_clients._impit", + "name": "_ClientCacheEntry", + "parsedDocstring": { + "text": "Type definition for client cache entries." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2169, + "module": "http_clients._impit", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2170, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2171, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Response", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2172, + "module": "http_clients._impit", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": 
"text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2173, + "module": "http_clients._impit", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2174, + "module": "http_clients._impit", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2175, + "module": "http_clients._impit", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2176, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] 
+ }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2177, + "module": "http_clients._impit", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2178, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Adapter class for `impit.Response` to conform to the `HttpResponse` protocol." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2169, + 2175, + 2177 + ], + "title": "Methods" + }, + { + "children": [ + 2174, + 2172, + 2173 + ], + "title": "Properties" + } + ], + "id": 2168, + "module": "http_clients._impit", + "name": "_ImpitResponse", + "parsedDocstring": { + "text": "Adapter class for `impit.Response` to conform to the `HttpResponse` protocol." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2180, + "module": "http_clients._impit", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session.", + "http3": "Whether to enable HTTP/3 support.", + "verify": "SSL certificates used to verify the identity of requested hosts.", + "browser": "Browser to impersonate.", + "async_client_kwargs": "Additional keyword arguments for `impit.AsyncClient`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2181, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2182, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable HTTP/3 support." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2183, + "kind": 32768, + "kindString": "Parameter", + "name": "http3", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "SSL certificates used to verify the identity of requested hosts." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2184, + "kind": 32768, + "kindString": "Parameter", + "name": "verify", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Browser to impersonate." + } + ] + }, + "defaultValue": "'firefox'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2185, + "kind": 32768, + "kindString": "Parameter", + "name": "browser", + "type": { + "name": "Browser | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Browser" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional keyword arguments for `impit.AsyncClient`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2186, + "kind": 32768, + "kindString": "Parameter", + "name": "async_client_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2187, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2077, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2078, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "2068" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2194, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": 
"The information about the proxy to be used.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 2083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2084, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2085, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2086, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2087, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2088, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2203, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": 
"The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 190 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 2092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2093, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2094, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2095, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2097, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2099, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the HTTP client." 
+ } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2212, + "module": "http_clients._impit", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the HTTP client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 271 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the HTTP client." + } + ] + }, + "flags": {}, + "id": 2213, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4021, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 2074, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4022, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2103, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4023, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize 
the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2107, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2108, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the `impit` library.\n\nThis client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers 
import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import ImpitHttpClient\n\nhttp_client = ImpitHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + } + ] + }, + "decorations": [ + { + "args": "('HTTP clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4022, + 4023, + 2180, + 2212, + 2187, + 2194, + 2203 + ], + "title": "Methods" + }, + { + "children": [ + 4021 + ], + "title": "Properties" + } + ], + "id": 2179, + "module": "http_clients._impit", + "name": "ImpitHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the `impit` library.\n\nThis client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import ImpitHttpClient\n\nhttp_client = ImpitHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_impit.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "2070", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2214, + "module": "http_clients._httpx", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } 
+ }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2216, + "module": "http_clients._httpx", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2217, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2218, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "httpx.Response", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2219, + "module": "http_clients._httpx", + "name": "http_version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2220, + "module": "http_clients._httpx", + "name": "status_code", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2221, + "module": "http_clients._httpx", + "name": "headers", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "HttpHeaders", + "type": "reference", + "target": "306" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2222, + "module": "http_clients._httpx", + "name": "read", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2223, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read", + "parameters": [], + "type": { + "name": "bytes", + "type": "reference", + "target": "682" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2224, + "module": "http_clients._httpx", + "name": "read_stream", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2225, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "read_stream", + "parameters": [], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2216, + 2222, + 2224 + ], + "title": "Methods" + }, + { + "children": [ + 2221, + 2219, + 2220 + ], + "title": "Properties" + } + ], + "id": 2215, + "module": "http_clients._httpx", + "name": "_HttpxResponse", + "parsedDocstring": { + "text": "Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2227, + "module": "http_clients._httpx", + "name": "handle_async_request", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2228, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "handle_async_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2229, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "httpx.Request", + "type": "reference" + } + } + ], + "type": { + "name": "httpx.Response", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP transport adapter that stores response cookies in a `Session`.\n\nThis transport adapter modifies the handling of HTTP requests to update the session cookies\nbased on the response cookies, ensuring that the cookies are stored in the session object\nrather than the `HTTPX` client itself." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2227 + ], + "title": "Async Resource Clients" + } + ], + "id": 2226, + "module": "http_clients._httpx", + "name": "_HttpxTransport", + "parsedDocstring": { + "text": "HTTP transport adapter that stores response cookies in a `Session`.\n\nThis transport adapter modifies the handling of HTTP requests to update the session cookies\nbased on the response cookies, ensuring that the cookies are stored in the session object\nrather than the `HTTPX` client itself." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2231, + "module": "http_clients._httpx", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "persist_cookies_per_session": "Whether to persist cookies per HTTP session.", + "http1": "Whether to enable HTTP/1.1 support.", + "http2": "Whether to enable HTTP/2 support.", + "verify": "SSL certificates used to verify the identity of requested hosts.", + "header_generator": "Header generator instance to use for generating common headers.", + "async_client_kwargs": "Additional keyword arguments for `httpx.AsyncClient`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2232, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to persist cookies per HTTP session." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2233, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_cookies_per_session", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable HTTP/1.1 support." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2234, + "kind": 32768, + "kindString": "Parameter", + "name": "http1", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to enable HTTP/2 support." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2235, + "kind": 32768, + "kindString": "Parameter", + "name": "http2", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "SSL certificates used to verify the identity of requested hosts." 
+ } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2236, + "kind": 32768, + "kindString": "Parameter", + "name": "verify", + "type": { + "name": "str | bool | SSLContext", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "bool" + } + ] + }, + { + "type": "reference", + "name": "SSLContext" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Header generator instance to use for generating common headers." + } + ] + }, + "defaultValue": "_DEFAULT_HEADER_GENERATOR", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2237, + "kind": 32768, + "kindString": "Parameter", + "name": "header_generator", + "type": { + "name": "HeaderGenerator | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HeaderGenerator", + "target": "1985" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional keyword arguments for `httpx.AsyncClient`." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2238, + "kind": 32768, + "kindString": "Parameter", + "name": "async_client_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.__init__", + "target": 2071, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2239, + "module": "http_clients._base", + "name": "crawl", + "parsedDocstring": { + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n", + "args": { + "request": "The request to be crawled.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "statistics": "The statistics object to register status codes.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The result of the crawling." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The result of the crawling." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Perform the crawling for a given request.\n\nThis method is called from `crawler.run()`.\n" + } + ] + }, + "flags": {}, + "id": 2076, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "crawl", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to be crawled." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2077, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2078, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2079, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The statistics object to register status codes." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2080, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics", + "type": { + "name": "Statistics | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Statistics", + "target": "2707" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2081, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpCrawlingResult", + "type": "reference", + "target": "2068" + }, + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.crawl", + "target": 2075, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2246, + "module": "http_clients._base", + "name": "send_request", + "parsedDocstring": { + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n", + "args": { + "url": "The URL to send the request to.", + "method": "The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": 
"The information about the proxy to be used.", + "timeout": "Maximum time allowed to process the request.\n" + }, + "returns": "The HTTP response received from the server." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 184 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The HTTP response received from the server." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Send an HTTP request via the client.\n\nThis method is called from `context.send_request()` helper.\n" + } + ] + }, + "flags": {}, + "id": 2083, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "send_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2084, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." + } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2085, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2086, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2087, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2088, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2089, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum time allowed to process the request.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2090, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "HttpResponse", + "type": "reference", + "target": "2060" + }, + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.send_request", + "target": 2082, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2255, + "module": "http_clients._base", + "name": "stream", + "parsedDocstring": { + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n", + "args": { + "url": "The URL to send the request to.", + "method": 
"The HTTP method to use.", + "headers": "The headers to include in the request.", + "payload": "The data to be sent as the request body.", + "session": "The session associated with the request.", + "proxy_info": "The information about the proxy to be used.", + "timeout": "The maximum time to wait for establishing the connection.\n" + }, + "returns": "An async context manager yielding the HTTP response with streaming capabilities." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 220 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An async context manager yielding the HTTP response with streaming capabilities." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Stream an HTTP request via the client.\n\nThis method should be used for downloading potentially large data where you need to process\nthe response body in chunks rather than loading it entirely into memory.\n" + } + ] + }, + "flags": {}, + "id": 2092, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "stream", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The URL to send the request to." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2093, + "kind": 32768, + "kindString": "Parameter", + "name": "url", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP method to use." 
+ } + ] + }, + "defaultValue": "'GET'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2094, + "kind": 32768, + "kindString": "Parameter", + "name": "method", + "type": { + "name": "HttpMethod", + "type": "reference", + "target": "300" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The headers to include in the request." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2095, + "kind": 32768, + "kindString": "Parameter", + "name": "headers", + "type": { + "name": "HttpHeaders | dict[str, str] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpHeaders", + "target": "306" + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The data to be sent as the request body." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2096, + "kind": 32768, + "kindString": "Parameter", + "name": "payload", + "type": { + "name": "HttpPayload | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpPayload", + "target": "301" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session associated with the request." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2097, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The information about the proxy to be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2098, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum time to wait for establishing the connection.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2099, + "kind": 32768, + "kindString": "Parameter", + "name": "timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AbstractAsyncContextManager", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "HttpResponse", + "target": "2060" + } + ] + }, + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.stream", + "target": 2091, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed 
and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2264, + "module": "http_clients._base", + "name": "cleanup", + "parsedDocstring": { + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 349 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clean up resources used by the client.\n\nThis method is called when the client is no longer needed and should be overridden\nin subclasses to perform any necessary cleanup such as closing connections,\nreleasing file handles, or other resource deallocation." + } + ] + }, + "flags": {}, + "id": 2101, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "cleanup", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + } + ], + "overwrites": { + "name": "HttpClient.cleanup", + "target": 2100, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 4024, + "module": "http_clients._base", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.active", + "target": 2074, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4025, + "module": "http_clients._base", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the client when entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the client when entering the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2103, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + }, + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aenter__", + "target": 2102, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and 
clean up resources when exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4026, + "module": "http_clients._base", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 213 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the client and clean up resources when exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2105, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2106, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2107, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2108, + "kind": 32768, + "kindString": "Parameter", + "name": "traceback", + "type": { + "name": "TracebackType | None", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "reference" + }, + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "HttpClient.__aexit__", + "target": 2104, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP client based on the `HTTPX` library.\n\nThis client uses the `HTTPX` library to 
perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import HttpxHttpClient\n\nhttp_client = HttpxHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + } + ] + }, + "decorations": [ + { + "args": "('HTTP clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 4025, + 4026, + 2231, + 2264, + 2239, + 2246, + 2255 + ], + "title": "Methods" + }, + { + "children": [ + 4024 + ], + "title": "Properties" + } + ], + "id": 2230, + "module": "http_clients._httpx", + "name": "HttpxHttpClient", + "parsedDocstring": { + "text": "HTTP client based on the `HTTPX` library.\n\nThis client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\nand to manage sessions, proxies, and error handling.\n\nSee the `HttpClient` class for more common information about HTTP clients.\n\n### Usage\n\n```python\nfrom crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler\nfrom crawlee.http_clients import HttpxHttpClient\n\nhttp_client = HttpxHttpClient()\ncrawler = HttpCrawler(http_client=http_client)\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/http_clients/_httpx.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "HttpClient", + "target": "2070", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the instrumentor.\n" + } + ] + }, + "decorations": [], + "flags": {}, + 
"groups": [], + "id": 2267, + "module": "otel.crawler_instrumentor", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize the instrumentor.\n", + "args": { + "instrument_classes": "List of classes to be instrumented - all their public methods and coroutines will be\nwrapped by generic instrumentation wrapper that will create spans for them.", + "request_handling_instrumentation": "When `True`, the most relevant methods in the request handling pipeline\nwill be instrumented. When `False`, no request handling instrumentation will be done." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/otel/crawler_instrumentor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the instrumentor.\n" + } + ] + }, + "flags": {}, + "id": 2268, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of classes to be instrumented - all their public methods and coroutines will be\nwrapped by generic instrumentation wrapper that will create spans for them." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2269, + "kind": 32768, + "kindString": "Parameter", + "name": "instrument_classes", + "type": { + "name": "list[type] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "target": "981" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "When `True`, the most relevant methods in the request handling pipeline\nwill be instrumented. 
When `False`, no request handling instrumentation will be done." + } + ] + }, + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2270, + "kind": 32768, + "kindString": "Parameter", + "name": "request_handling_instrumentation", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a list of python packages with versions that will be instrumented." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2271, + "module": "otel.crawler_instrumentor", + "name": "instrumentation_dependencies", + "parsedDocstring": { + "text": "Return a list of python packages with versions that will be instrumented." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/otel/crawler_instrumentor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 120 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a list of python packages with versions that will be instrumented." + } + ] + }, + "flags": {}, + "id": 2272, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "instrumentation_dependencies", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Helper class for instrumenting crawlers with OpenTelemetry." 
+ } + ] + }, + "decorations": [ + { + "args": "('Other')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2267, + 2271 + ], + "title": "Methods" + } + ], + "id": 2266, + "module": "otel.crawler_instrumentor", + "name": "CrawlerInstrumentor", + "parsedDocstring": { + "text": "Helper class for instrumenting crawlers with OpenTelemetry." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/otel/crawler_instrumentor.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2274, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Get the number of requests in the loader that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2276, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 2277, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2278, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + } + ] + }, + "flags": {}, + "id": 2279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2280, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2281, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2282, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." 
+ } + ] + }, + "flags": {}, + "id": 2283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2284, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 2285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2286, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2287, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestManager", + "target": "2339" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class defining the interface for classes that provide access to a read-only stream of requests.\n\nRequest loaders are used to manage and provide access to a storage of crawling requests.\n\nKey responsibilities:\n- Fetching the next request to be processed.\n- Marking requests as successfully handled after processing.\n- Managing state information such as the total and handled request counts." 
+ } + ] + }, + "decorations": [ + { + "args": "('Request loaders')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2282, + 2274, + 2276, + 2278, + 2280, + 2284, + 2287 + ], + "title": "Methods" + } + ], + "id": 2273, + "module": "request_loaders._request_loader", + "name": "RequestLoader", + "parsedDocstring": { + "text": "An abstract class defining the interface for classes that provide access to a read-only stream of requests.\n\nRequest loaders are used to manage and provide access to a storage of crawling requests.\n\nKey responsibilities:\n- Fetching the next request to be processed.\n- Marking requests as successfully handled after processing.\n- Managing state information such as the total and handled request counts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "SitemapRequestLoader", + "target": "2302", + "type": "reference" + }, + { + "name": "RequestManager", + "target": "2339", + "type": "reference" + }, + { + "name": "RequestList", + "target": "2403", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2290, + "module": "request_loaders._sitemap_request_loader", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2292, + "module": "request_loaders._sitemap_request_loader", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Queue of URLs extracted from sitemaps and ready for processing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2293, + "module": "request_loaders._sitemap_request_loader", + "name": "url_queue", + "parsedDocstring": { + "text": "Queue of URLs extracted from sitemaps and ready for processing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "deque", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set of request URLs currently being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2294, + "module": "request_loaders._sitemap_request_loader", + "name": "in_progress", + "parsedDocstring": { + "text": "Set of request URLs currently being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Queue of sitemap URLs that need to be fetched and processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2295, + "module": "request_loaders._sitemap_request_loader", + "name": "pending_sitemap_urls", + "parsedDocstring": { + "text": "Queue of sitemap URLs that need to be fetched and processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "deque", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The sitemap URL currently being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2296, + "module": "request_loaders._sitemap_request_loader", + "name": "in_progress_sitemap_url", + "parsedDocstring": { + "text": "The sitemap URL currently being processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='inProgressSitemapUrl')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "URLs from the current sitemap that have been added to the queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2297, + "module": "request_loaders._sitemap_request_loader", + "name": "current_sitemap_processed_urls", + "parsedDocstring": { + "text": "URLs from the current sitemap that have been added to the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set of processed sitemap URLs." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2298, + "module": "request_loaders._sitemap_request_loader", + "name": "processed_sitemap_urls", + "parsedDocstring": { + "text": "Set of processed sitemap URLs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether all sitemaps have been fully processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2299, + "module": "request_loaders._sitemap_request_loader", + "name": "completed", + "parsedDocstring": { + "text": "Whether all sitemaps have been fully processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total number of URLs found and added to the queue from all processed sitemaps." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2300, + "module": "request_loaders._sitemap_request_loader", + "name": "total_count", + "parsedDocstring": { + "text": "Total number of URLs found and added to the queue from all processed sitemaps." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of URLs that have been successfully handled." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2301, + "module": "request_loaders._sitemap_request_loader", + "name": "handled_count", + "parsedDocstring": { + "text": "Number of URLs that have been successfully handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "State model for persisting sitemap request loader data.\n\nThe crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.\nThe `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved\nfrom the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to\n`pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a\n`SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,\nthe loader was restarted from a saved state and the URL is skipped.\n\nIf the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is\nincremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`\nis set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is\ncleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in\n`processed_sitemap_urls`. 
If `pending_sitemap_urls` is empty, `completed` is set to `True`.\n\nWhen `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.\nWhen `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and\n`handled_count` is incremented by 1.\n\nDuring initial startup or restart after persistence, state validation occurs in `_get_state`. If both\n`pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a\nfresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is\nrestarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and\n`in_progress` is cleared." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2299, + 2297, + 2301, + 2294, + 2296, + 2292, + 2295, + 2298, + 2300, + 2293 + ], + "title": "Properties" + } + ], + "id": 2291, + "module": "request_loaders._sitemap_request_loader", + "name": "SitemapRequestLoaderState", + "parsedDocstring": { + "text": "State model for persisting sitemap request loader data.\n\nThe crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.\nThe `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved\nfrom the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to\n`pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a\n`SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,\nthe loader was restarted from a saved state and the URL is skipped.\n\nIf the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is\nincremented by 1. 
When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`\nis set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is\ncleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in\n`processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.\n\nWhen `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.\nWhen `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and\n`handled_count` is incremented by 1.\n\nDuring initial startup or restart after persistence, state validation occurs in `_get_state`. If both\n`pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a\nfresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is\nrestarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and\n`in_progress` is cleared." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the sitemap request loader.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2303, + "module": "request_loaders._sitemap_request_loader", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize the sitemap request loader.\n", + "args": { + "sitemap_urls": "Configuration options for the loader.", + "proxy_info": "Optional proxy to use for fetching sitemaps.", + "include": "List of glob or regex patterns to include URLs.", + "exclude": "List of glob or regex patterns to exclude URLs.", + "max_buffer_size": "Maximum number of URLs to buffer in memory.", + "http_client": "the instance of `HttpClient` to use for fetching sitemaps.", + "persist_state_key": "A key for persisting the loader's state in the KeyValueStore.\nWhen provided, allows resuming from where it left off after interruption.\nIf None, no state persistence occurs.", + "transform_request_function": "An optional function to transform requests\ngenerated by the loader. It receives `RequestOptions` with `url` and should return either\nmodified `RequestOptions` or a `RequestTransformAction`." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the sitemap request loader.\n" + } + ] + }, + "flags": {}, + "id": 2304, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration options for the loader." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2305, + "kind": 32768, + "kindString": "Parameter", + "name": "sitemap_urls", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "the instance of `HttpClient` to use for fetching sitemaps." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2306, + "kind": 32768, + "kindString": "Parameter", + "name": "http_client", + "type": { + "name": "HttpClient", + "type": "reference", + "target": "2070" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional proxy to use for fetching sitemaps." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2307, + "kind": 32768, + "kindString": "Parameter", + "name": "proxy_info", + "type": { + "name": "ProxyInfo | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProxyInfo", + "target": "254" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of glob or regex patterns to include URLs." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2308, + "kind": 32768, + "kindString": "Parameter", + "name": "include", + "type": { + "name": "list[re.Pattern[Any] | Glob] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of glob or regex patterns to exclude URLs." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2309, + "kind": 32768, + "kindString": "Parameter", + "name": "exclude", + "type": { + "name": "list[re.Pattern[Any] | Glob] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "re.Pattern", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "Glob", + "target": "784" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of URLs to buffer in memory." 
+ } + ] + }, + "defaultValue": "200", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2310, + "kind": 32768, + "kindString": "Parameter", + "name": "max_buffer_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A key for persisting the loader's state in the KeyValueStore.\nWhen provided, allows resuming from where it left off after interruption.\nIf None, no state persistence occurs." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2311, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An optional function to transform requests\ngenerated by the loader. It receives `RequestOptions` with `url` and should return either\nmodified `RequestOptions` or a `RequestTransformAction`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2312, + "kind": 32768, + "kindString": "Parameter", + "name": "transform_request_function", + "type": { + "name": "Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[RequestOptions]" + }, + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "RequestOptions", + "target": "134" + }, + { + "type": "reference", + "name": "RequestTransformAction", + "target": "302" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of URLs found so far." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2313, + "module": "request_loaders._sitemap_request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Return the total number of URLs found so far." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 288 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the total number of URLs found so far." 
+ } + ] + }, + "flags": {}, + "id": 2314, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of URLs that have been handled." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2315, + "module": "request_loaders._sitemap_request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Return the number of URLs that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 294 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the number of URLs that have been handled." + } + ] + }, + "flags": {}, + "id": 2316, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if there are no more URLs to process." 
+ } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2317, + "module": "request_loaders._sitemap_request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if there are no more URLs to process." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 300 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if there are no more URLs to process." + } + ] + }, + "flags": {}, + "id": 2318, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if all URLs have been processed." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2319, + "module": "request_loaders._sitemap_request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Check if all URLs have been processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 306 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if all URLs have been processed." 
+ } + ] + }, + "flags": {}, + "id": 2320, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fetch the next request to process." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2321, + "module": "request_loaders._sitemap_request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Fetch the next request to process." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 312 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fetch the next request to process." 
+ } + ] + }, + "flags": {}, + "id": 2322, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as successfully handled." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2323, + "module": "request_loaders._sitemap_request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as successfully handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 340 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as successfully handled." 
+ } + ] + }, + "flags": {}, + "id": 2324, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2325, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Abort the sitemap loading process." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2326, + "module": "request_loaders._sitemap_request_loader", + "name": "abort_loading", + "parsedDocstring": { + "text": "Abort the sitemap loading process." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 348 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Abort the sitemap loading process." 
+ } + ] + }, + "flags": {}, + "id": 2327, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "abort_loading", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the sitemap loading process." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2328, + "module": "request_loaders._sitemap_request_loader", + "name": "start", + "parsedDocstring": { + "text": "Start the sitemap loading process." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 355 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Start the sitemap loading process." + } + ] + }, + "flags": {}, + "id": 2329, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "start", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the request loader." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2330, + "module": "request_loaders._sitemap_request_loader", + "name": "close", + "parsedDocstring": { + "text": "Close the request loader." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 361 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the request loader." + } + ] + }, + "flags": {}, + "id": 2331, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2332, + "module": "request_loaders._sitemap_request_loader", + "name": "__aenter__", + "parsedDocstring": { + "text": "Enter the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 366 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Enter the context manager." + } + ] + }, + "flags": {}, + "id": 2333, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "SitemapRequestLoader", + "type": "reference", + "target": "2302" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2334, + "module": "request_loaders._sitemap_request_loader", + "name": "__aexit__", + "parsedDocstring": { + "text": "Exit the context manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 371 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Exit the context manager." + } + ] + }, + "flags": {}, + "id": 2335, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2336, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2337, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2338, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 
2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3994, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A request loader that reads URLs from sitemap(s).\n\nThe loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol\n(https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.\nNote that HTML pages containing links are not supported - those should be handled by regular crawlers\nand the `enqueue_links` functionality.\n\nThe loader fetches and parses sitemaps in the background, allowing crawling to start\nbefore all URLs are loaded. It supports filtering URLs using glob and regex patterns.\n\nThe loader supports state persistence, allowing it to resume from where it left off\nafter interruption when a `persist_state_key` is provided during initialization." 
+ } + ] + }, + "decorations": [ + { + "args": "('Request loaders')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2332, + 2334, + 2303, + 2326, + 2330, + 2321, + 2315, + 2313, + 2317, + 2319, + 2323, + 2328, + 3994 + ], + "title": "Methods" + } + ], + "id": 2302, + "module": "request_loaders._sitemap_request_loader", + "name": "SitemapRequestLoader", + "parsedDocstring": { + "text": "A request loader that reads URLs from sitemap(s).\n\nThe loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol\n(https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.\nNote that HTML pages containing links are not supported - those should be handled by regular crawlers\nand the `enqueue_links` functionality.\n\nThe loader fetches and parses sitemaps in the background, allowing crawling to start\nbefore all URLs are loaded. It supports filtering URLs using glob and regex patterns.\n\nThe loader supports state persistence, allowing it to resume from where it left off\nafter interruption when a `persist_state_key` is provided during initialization." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestLoader", + "target": "2273", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2340, + "module": "request_loaders._request_manager", + "name": "drop", + "parsedDocstring": { + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + } + ] + }, + "flags": {}, + "id": 2341, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2342, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager or None if the request was not added." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager or None if the request was not added." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 2343, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2344, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2345, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": 
"RequestManager.add_request", + "target": 2342, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2346, + "module": "request_loaders._request_manager", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + "args": { + "requests": "Requests to enqueue.", + "forefront": "If True, add requests to the beginning of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2347, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2348, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the beginning of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2349, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2350, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2351, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2352, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2353, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests", + "target": 2346, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2354, + "module": "request_loaders._request_manager", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "flags": {}, + "id": 2355, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2356, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2357, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 2354, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3995, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Get the number of requests in the loader that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." + } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3995, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3996, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 2277, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3996, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3997, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." 
+ } + ] + }, + "flags": {}, + "id": 2279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3997, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3998, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2281, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3998, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3999, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." 
+ } + ] + }, + "flags": {}, + "id": 2283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3999, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 4000, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 2285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2286, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 4000, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4001, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones." 
+ } + ] + }, + "decorations": [ + { + "args": "('Request loaders')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2342, + 2346, + 2340, + 3999, + 3995, + 3996, + 3997, + 3998, + 4000, + 2354, + 4001 + ], + "title": "Methods" + } + ], + "id": 2339, + "module": "request_loaders._request_manager", + "name": "RequestManager", + "parsedDocstring": { + "text": "Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestLoader", + "target": "2273", + "type": "reference" + } + ], + "extendedBy": [ + { + "name": "RequestManagerTandem", + "target": "2359", + "type": "reference" + }, + { + "name": "RequestQueue", + "target": "3852", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2358, + "module": "request_loaders._request_manager_tandem", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2360, + "module": "request_loaders._request_manager_tandem", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, 
+ "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2361, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2362, + "kind": 32768, + "kindString": "Parameter", + "name": "request_loader", + "type": { + "name": "RequestLoader", + "type": "reference", + "target": "2273" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2363, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager", + "type": "reference", + "target": "2339" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2364, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Get the number of requests in the loader that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3995, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3995, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2366, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." 
+ } + ] + }, + "flags": {}, + "id": 2277, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3996, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3996, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2368, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." 
+ } + ] + }, + "flags": {}, + "id": 2279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3997, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3997, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2370, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2281, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3998, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3998, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2372, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager or None if the request was not added." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager or None if the request was not added." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 2343, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2344, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2345, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.add_request", + "target": 2342, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_request", + "target": 2342, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2376, + "module": 
"request_loaders._request_manager", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + "args": { + "requests": "Requests to enqueue.", + "forefront": "If True, add requests to the beginning of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2347, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2348, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the beginning of the queue." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2349, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2350, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2351, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2352, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2353, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests", + "target": 2346, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_requests", + "target": 2346, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2384, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + } + ] + }, + "flags": {}, + "id": 2283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3999, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3999, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2386, + "module": "request_loaders._request_manager", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\nIt is possible to modify the request data by supplying an updated request as a parameter." + } + ] + }, + "flags": {}, + "id": 2355, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2356, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2357, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 2354, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 2354, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2390, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "flags": {}, + "id": 2285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2286, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 4000, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 4000, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2393, + "module": "request_loaders._request_manager", + "name": "drop", + "parsedDocstring": { + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove persistent state either from the Apify Cloud storage or from the local database." + } + ] + }, + "flags": {}, + "id": 2341, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4003, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.\n\nIn this scenario, the contents of the \"loader\" get transferred into the \"manager\", allowing processing the requests\nfrom both sources and also enqueueing new requests (not possible with plain `RequestManager`)." 
+ } + ] + }, + "decorations": [ + { + "args": "('Request loaders')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2360, + 2372, + 2376, + 2393, + 2384, + 2364, + 2366, + 2368, + 2370, + 2390, + 2386, + 4003 + ], + "title": "Methods" + } + ], + "id": 2359, + "module": "request_loaders._request_manager_tandem", + "name": "RequestManagerTandem", + "parsedDocstring": { + "text": "Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.\n\nIn this scenario, the contents of the \"loader\" get transferred into the \"manager\", allowing processing the requests\nfrom both sources and also enqueueing new requests (not possible with plain `RequestManager`)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_manager_tandem.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestManager", + "target": "2339", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2395, + "module": "request_loaders._request_list", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2397, + "module": "request_loaders._request_list", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + 
"sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2398, + "module": "request_loaders._request_list", + "name": "next_index", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2399, + "module": "request_loaders._request_list", + "name": "next_unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='nextUniqueKey')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2400, + "module": "request_loaders._request_list", + "name": "in_progress", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": 
"set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2400, + 2397, + 2398, + 2399 + ], + "title": "Properties" + } + ], + "id": 2396, + "module": "request_loaders._request_list", + "name": "RequestListState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2402, + "module": "request_loaders._request_list", + "name": "requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2402 + ], + "title": "Properties" + } + ], + "id": 2401, + "module": "request_loaders._request_list", + "name": "RequestListData", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": 
"Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2404, + "module": "request_loaders._request_list", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "requests": "The request objects (or their string representations) to be added to the provider.", + "name": "A name of the request list.", + "persist_state_key": "A key for persisting the progress information of the RequestList.\nIf you do not pass a key but pass a `name`, a key will be derived using the name.\nOtherwise, state will not be persisted.", + "persist_requests_key": "A key for persisting the request data loaded from the `requests` iterator.\nIf specified, the request data will be stored in the KeyValueStore to make sure that they don't change\nover time. This is useful if the `requests` iterator pulls the data dynamically." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2405, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request objects (or their string representations) to be added to the provider." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2406, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Iterable[str | Request] | AsyncIterable[str | Request] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "Iterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + }, + { + "type": "reference", + "name": "AsyncIterable", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A name of the request list." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2407, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A key for persisting the progress information of the RequestList.\nIf you do not pass a key but pass a `name`, a key will be derived using the name.\nOtherwise, state will not be persisted." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2408, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A key for persisting the request data loaded from the `requests` iterator.\nIf specified, the request data will be stored in the KeyValueStore to make sure that they don't change\nover time. This is useful if the `requests` iterator pulls the data dynamically." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2409, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_requests_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2410, + "module": "request_loaders._request_list", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the 
number of requests in the loader that have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2411, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Get the number of requests in the loader that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." + } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_handled_count", + "target": 2274, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2413, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "flags": {}, + "id": 2277, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.get_total_count", + "target": 2276, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2415, + "module": "request_loaders._request_loader", + "name": "is_empty", + "parsedDocstring": { + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if there are no more requests in the loader (there might still be unfinished requests)." 
+ } + ] + }, + "flags": {}, + "id": 2279, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_empty", + "target": 2278, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2417, + "module": "request_loaders._request_loader", + "name": "is_finished", + "parsedDocstring": { + "text": "Return True if all requests have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return True if all requests have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2281, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.is_finished", + "target": 2280, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2419, + "module": "request_loaders._request_loader", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request to be processed, or `None` if there are no more pending requests.\n\nThe method should return `None` if and only if `is_finished` would return `True`. In other cases, the method\nshould wait until a request appears." 
+ } + ] + }, + "flags": {}, + "id": 2283, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.fetch_next_request", + "target": 2282, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2421, + "module": "request_loaders._request_loader", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after a successful processing (or after giving up retrying)." 
+ } + ] + }, + "flags": {}, + "id": 2285, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2286, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestLoader.mark_request_as_handled", + "target": 2284, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4002, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a (potentially very large) list of URLs to crawl." + } + ] + }, + "decorations": [ + { + "args": "('Request loaders')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2404, + 2419, + 2411, + 2413, + 2415, + 2417, + 2421, + 4002 + ], + "title": "Methods" + }, + { + "children": [ + 2410 + ], + "title": "Properties" + } + ], + "id": 2403, + "module": "request_loaders._request_list", + "name": "RequestList", + "parsedDocstring": { + "text": "Represents a (potentially very large) list of URLs to crawl." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_list.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestLoader", + "target": "2273", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2425, + "module": "sessions._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2426, + "module": "sessions._models", + "name": "id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2427, + "module": "sessions._models", + "name": "max_age", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2428, + "module": "sessions._models", + "name": "user_data", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2429, + "module": "sessions._models", + "name": "max_error_score", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2430, + "module": "sessions._models", + "name": "error_score_decrement", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2431, + "module": "sessions._models", + "name": "created_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + 
], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2432, + "module": "sessions._models", + "name": "usage_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2433, + "module": "sessions._models", + "name": "max_usage_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2434, + "module": "sessions._models", + "name": "error_score", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2435, + "module": "sessions._models", + "name": "cookies", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + 
"gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2436, + "module": "sessions._models", + "name": "blocked_status_codes", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a Session object." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2436, + 2435, + 2431, + 2434, + 2430, + 2426, + 2427, + 2429, + 2433, + 2425, + 2432, + 2428 + ], + "title": "Properties" + } + ], + "id": 2424, + "module": "sessions._models", + "name": "SessionModel", + "parsedDocstring": { + "text": "Model for a Session object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2438, + "module": "sessions._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2439, + "module": "sessions._models", + "name": "max_pool_size", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2440, + "module": "sessions._models", + "name": "sessions", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Session", + "target": "2445" + } + ] + } 
+ }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the total number of sessions currently maintained in the pool." + } + ] + }, + "decorations": [ + { + "args": "(alias='sessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2441, + "module": "sessions._models", + "name": "session_count", + "parsedDocstring": { + "text": "Get the total number of sessions currently maintained in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are currently usable." + } + ] + }, + "decorations": [ + { + "args": "(alias='usableSessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2442, + "module": "sessions._models", + "name": "usable_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are currently usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are no longer usable." 
+ } + ] + }, + "decorations": [ + { + "args": "(alias='retiredSessionCount')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2443, + "module": "sessions._models", + "name": "retired_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are no longer usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a SessionPool object." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2439, + 2438, + 2443, + 2441, + 2440, + 2442 + ], + "title": "Properties" + } + ], + "id": 2437, + "module": "sessions._models", + "name": "SessionPoolModel", + "parsedDocstring": { + "text": "Model for a SessionPool object." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2444, + "module": "sessions._session", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + 
"groups": [], + "id": 2446, + "module": "sessions._session", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "id": "Unique identifier for the session, autogenerated if not provided.", + "max_age": "Time duration after which the session expires.", + "user_data": "Custom user data associated with the session.", + "max_error_score": "Threshold score beyond which the session is considered blocked.", + "error_score_decrement": "Value by which the error score is decremented on successful operations.", + "created_at": "Timestamp when the session was created, defaults to current UTC time if not provided.", + "usage_count": "Number of times the session has been used.", + "max_usage_count": "Maximum allowable uses of the session before it is considered expired.", + "error_score": "Current error score of the session.", + "cookies": "Cookies associated with the session.", + "blocked_status_codes": "HTTP status codes that indicate a session should be blocked." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2447, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the session, autogenerated if not provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2448, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time duration after which the session expires." + } + ] + }, + "defaultValue": "timedelta(minutes=50)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2449, + "kind": 32768, + "kindString": "Parameter", + "name": "max_age", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom user data associated with the session." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2450, + "kind": 32768, + "kindString": "Parameter", + "name": "user_data", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Threshold score beyond which the session is considered blocked." + } + ] + }, + "defaultValue": "3.0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2451, + "kind": 32768, + "kindString": "Parameter", + "name": "max_error_score", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value by which the error score is decremented on successful operations." 
+ } + ] + }, + "defaultValue": "0.5", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2452, + "kind": 32768, + "kindString": "Parameter", + "name": "error_score_decrement", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp when the session was created, defaults to current UTC time if not provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2453, + "kind": 32768, + "kindString": "Parameter", + "name": "created_at", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the session has been used." + } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2454, + "kind": 32768, + "kindString": "Parameter", + "name": "usage_count", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum allowable uses of the session before it is considered expired." + } + ] + }, + "defaultValue": "50", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2455, + "kind": 32768, + "kindString": "Parameter", + "name": "max_usage_count", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Current error score of the session." + } + ] + }, + "defaultValue": "0.0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2456, + "kind": 32768, + "kindString": "Parameter", + "name": "error_score", + "type": { + "name": "float", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookies associated with the session." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2457, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionCookies", + "target": "2557" + }, + { + "type": "reference", + "name": "CookieJar" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "HTTP status codes that indicate a session should be blocked." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2458, + "kind": 32768, + "kindString": "Parameter", + "name": "blocked_status_codes", + "type": { + "name": "list | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from a `SessionModel`." + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2459, + "module": "sessions._session", + "name": "from_model", + "parsedDocstring": { + "text": "Initialize a new instance from a `SessionModel`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance from a `SessionModel`." + } + ] + }, + "flags": {}, + "id": 2460, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "from_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2461, + "kind": 32768, + "kindString": "Parameter", + "name": "model", + "type": { + "name": "SessionModel", + "type": "reference", + "target": "2424" + } + } + ], + "type": { + "name": "Session", + "type": "reference", + "target": "2445" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2462, + "module": "sessions._session", + "name": "__repr__", + "parsedDocstring": { + "text": "Get a string representation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "flags": {}, + "id": 2463, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare two sessions for equality." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2464, + "module": "sessions._session", + "name": "__eq__", + "parsedDocstring": { + "text": "Compare two sessions for equality." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Compare two sessions for equality." + } + ] + }, + "flags": {}, + "id": 2465, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2466, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the session state." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2467, + "module": "sessions._session", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash based on the session state." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the session state." 
+ } + ] + }, + "flags": {}, + "id": 2468, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the session ID." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2469, + "module": "sessions._session", + "name": "id", + "parsedDocstring": { + "text": "Get the session ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the user data." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2470, + "module": "sessions._session", + "name": "user_data", + "parsedDocstring": { + "text": "Get the user data." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 120 + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the cookies." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2471, + "module": "sessions._session", + "name": "cookies", + "parsedDocstring": { + "text": "Get the cookies." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "SessionCookies", + "type": "reference", + "target": "2557" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current error score." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2472, + "module": "sessions._session", + "name": "error_score", + "parsedDocstring": { + "text": "Get the current error score." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the current usage count." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2473, + "module": "sessions._session", + "name": "usage_count", + "parsedDocstring": { + "text": "Get the current usage count." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the expiration datetime of the session." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2474, + "module": "sessions._session", + "name": "expires_at", + "parsedDocstring": { + "text": "Get the expiration datetime of the session." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session is blocked based on the error score.." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2475, + "module": "sessions._session", + "name": "is_blocked", + "parsedDocstring": { + "text": "Indicate whether the session is blocked based on the error score.." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session is expired based on the current time." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2476, + "module": "sessions._session", + "name": "is_expired", + "parsedDocstring": { + "text": "Indicate whether the session is expired based on the current time." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the session has reached its maximum usage limit." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2477, + "module": "sessions._session", + "name": "is_max_usage_count_reached", + "parsedDocstring": { + "text": "Indicate whether the session has reached its maximum usage limit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determine if the session is usable for next requests." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2478, + "module": "sessions._session", + "name": "is_usable", + "parsedDocstring": { + "text": "Determine if the session is usable for next requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2479, + "module": "sessions._session", + "name": "get_state", + "parsedDocstring": { + "text": "Retrieve the current state of the session either as a model or as a dictionary." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 170 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 2480, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2481, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "SessionModel | dict", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionModel", + "target": "2424" + }, + { + "type": "reference", + "name": "dict" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 2492, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2493, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": true + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the session either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 2494, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2495, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": false + } + ] + } + } + ], + "type": { + "name": "SessionModel", + "type": "reference", + "target": "2424" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as good. Should be called after a successful session usage." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2482, + "module": "sessions._session", + "name": "mark_good", + "parsedDocstring": { + "text": "Mark the session as good. Should be called after a successful session usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 191 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as good. Should be called after a successful session usage." + } + ] + }, + "flags": {}, + "id": 2483, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "mark_good", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as bad after an unsuccessful session usage." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2484, + "module": "sessions._session", + "name": "mark_bad", + "parsedDocstring": { + "text": "Mark the session as bad after an unsuccessful session usage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 202 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the session as bad after an unsuccessful session usage." + } + ] + }, + "flags": {}, + "id": 2485, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "mark_bad", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2486, + "module": "sessions._session", + "name": "retire", + "parsedDocstring": { + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retire the session by setting the error score to the maximum value.\n\nThis method should be used if the session usage was unsuccessful and you are sure that it is because of\nthe session configuration and not any external matters. For example when server returns 403 status code.\nIf the session does not work due to some external factors as server error such as 5XX you probably want\nto use `mark_bad` method." + } + ] + }, + "flags": {}, + "id": 2487, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "retire", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2488, + "module": "sessions._session", + "name": "is_blocked_status_code", + "parsedDocstring": { + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n", + "args": { + "status_code": "The HTTP status code received from a server response.", + "ignore_http_error_status_codes": "Optional status codes to allow suppression of\ncodes from `blocked_status_codes`.\n" + }, + "returns": "True if the session should be retired, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 223 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the session should be retired, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Evaluate whether a session should be retired based on the received HTTP status code.\n" + } + ] + }, + "flags": {}, + "id": 2489, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "is_blocked_status_code", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The HTTP status code received from a server response." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2490, + "kind": 32768, + "kindString": "Parameter", + "name": "status_code", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Optional status codes to allow suppression of\ncodes from `blocked_status_codes`.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2491, + "kind": 32768, + "kindString": "Parameter", + "name": "ignore_http_error_status_codes", + "type": { + "name": "set[int] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "set", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ], + "target": "2562" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represent a single user session, managing cookies, error states, and usage limits.\n\nA `Session` simulates a specific user with attributes like 
cookies, IP (via proxy), and potentially\na unique browser fingerprint. It maintains its internal state, which can include custom user data\n(e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,\nusage count, and expiration." + } + ] + }, + "decorations": [ + { + "args": "('Session management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2464, + 2467, + 2446, + 2462, + 2459, + 2479, + 2488, + 2484, + 2482, + 2486 + ], + "title": "Methods" + }, + { + "children": [ + 2471, + 2472, + 2474, + 2469, + 2475, + 2476, + 2477, + 2478, + 2473, + 2470 + ], + "title": "Properties" + } + ], + "id": 2445, + "module": "sessions._session", + "name": "Session", + "parsedDocstring": { + "text": "Represent a single user session, managing cookies, error states, and usage limits.\n\nA `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially\na unique browser fingerprint. It maintains its internal state, which can include custom user data\n(e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,\nusage count, and expiration." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2496, + "module": "sessions._session_pool", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2497, + "module": "sessions._session_pool", + "name": "CreateSessionFunctionType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2499, + "module": "sessions._session_pool", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n", + "args": { + "max_pool_size": "Maximum number of sessions to maintain in the pool. You can add more sessions to the pool\nby using the `add_session` method.", + "create_session_settings": "Settings for creating new session instances. 
If None, default settings will\nbe used. Do not set it if you are providing a `create_session_function`.", + "create_session_function": "A callable to create new session instances. If None, a default session settings\nwill be used. Do not set it if you are providing `create_session_settings`.", + "event_manager": "The event manager to handle events like persist state.", + "persistence_enabled": "Flag to enable or disable state persistence of the pool.", + "persist_state_kvs_name": "The name of the `KeyValueStore` used for state persistence.", + "persist_state_key": "The key under which the session pool's state is stored in the `KeyValueStore`." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n" + } + ] + }, + "flags": {}, + "id": 2500, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of sessions to maintain in the pool. You can add more sessions to the pool\nby using the `add_session` method." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2501, + "kind": 32768, + "kindString": "Parameter", + "name": "max_pool_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Settings for creating new session instances. If None, default settings will\nbe used. Do not set it if you are providing a `create_session_function`." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2502, + "kind": 32768, + "kindString": "Parameter", + "name": "create_session_settings", + "type": { + "name": "dict | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A callable to create new session instances. If None, a default session settings\nwill be used. Do not set it if you are providing `create_session_settings`." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2503, + "kind": 32768, + "kindString": "Parameter", + "name": "create_session_function", + "type": { + "name": "CreateSessionFunctionType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "CreateSessionFunctionType", + "target": "2497" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The event manager to handle events like persist state." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2504, + "kind": 32768, + "kindString": "Parameter", + "name": "event_manager", + "type": { + "name": "EventManager | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "EventManager", + "target": "1907" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag to enable or disable state persistence of the pool." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2505, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the `KeyValueStore` used for state persistence." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2506, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which the session pool's state is stored in the `KeyValueStore`." + } + ] + }, + "defaultValue": "'CRAWLEE_SESSION_POOL_STATE'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2507, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2508, + "module": "sessions._session_pool", + "name": "__repr__", + "parsedDocstring": { + "text": "Get a string representation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a string representation." 
+ } + ] + }, + "flags": {}, + "id": 2509, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the total number of sessions currently maintained in the pool." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2510, + "module": "sessions._session_pool", + "name": "session_count", + "parsedDocstring": { + "text": "Get the total number of sessions currently maintained in the pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are currently usable." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2511, + "module": "sessions._session_pool", + "name": "usable_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are currently usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of sessions that are no longer usable." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2512, + "module": "sessions._session_pool", + "name": "retired_session_count", + "parsedDocstring": { + "text": "Get the number of sessions that are no longer usable." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2513, + "module": "sessions._session_pool", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the pool upon entering the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2514, + "module": "sessions._session_pool", + "name": "__aenter__", + "parsedDocstring": { + "text": "Initialize the pool upon entering the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the pool upon entering the context manager.\n" + } + ] + }, + 
"flags": {}, + "id": 2515, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "SessionPool", + "type": "reference", + "target": "2498" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the pool upon exiting the context manager.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2516, + "module": "sessions._session_pool", + "name": "__aexit__", + "parsedDocstring": { + "text": "Deinitialize the pool upon exiting the context manager.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Deinitialize the pool upon exiting the context manager.\n" + } + ] + }, + "flags": {}, + "id": 2517, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2518, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2519, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + 
] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2520, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2521, + "module": "sessions._session_pool", + "name": "get_state", + "parsedDocstring": { + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 2522, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2523, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "SessionPoolModel | dict", + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionPoolModel", + "target": "2437" + }, + { + "type": "reference", + "name": "dict" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." + } + ] + }, + "flags": {}, + "id": 2534, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2535, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": true + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the current state of the pool either as a model or as a dictionary." 
+ } + ] + }, + "flags": {}, + "id": 2536, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_state", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2537, + "kind": 32768, + "kindString": "Parameter", + "name": "as_dict", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": false + } + ] + } + } + ], + "type": { + "name": "SessionPoolModel", + "type": "reference", + "target": "2437" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2524, + "module": "sessions._session_pool", + "name": "add_session", + "parsedDocstring": { + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n", + "args": { + "session": "The session to add to the pool." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 163 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an externally created session to the pool.\n\nThis is intended only for the cases when you want to add a session that was created outside of the pool.\nOtherwise, the pool will create new sessions automatically.\n" + } + ] + }, + "flags": {}, + "id": 2525, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "add_session", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The session to add to the pool." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2526, + "kind": 32768, + "kindString": "Parameter", + "name": "session", + "type": { + "name": "Session", + "type": "reference", + "target": "2445" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2527, + "module": "sessions._session_pool", + "name": "get_session", + "parsedDocstring": { + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n", + "returns": "The session object." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The session object." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a random session from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. If the random session is not usable,\nretired sessions are removed and a new session is created and returned.\n" + } + ] + }, + "flags": {}, + "id": 2528, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [], + "type": { + "name": "Session", + "type": "reference", + "target": "2445" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n" + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2529, + "module": "sessions._session_pool", + "name": "get_session_by_id", + "parsedDocstring": { + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n", + "args": { + "session_id": "The ID of the session to retrieve.\n" + }, + "returns": "The session object if found and usable, otherwise `None`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 200 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The session object if found and usable, otherwise `None`." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a session by ID from the pool.\n\nThis method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\nsession by ID. If the session is not found or not usable, `None` is returned.\n" + } + ] + }, + "flags": {}, + "id": 2530, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session_by_id", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the session to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2531, + "kind": 32768, + "kindString": "Parameter", + "name": "session_id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Session | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Session", + "target": "2445" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the KVS where the pool state is persisted." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2532, + "module": "sessions._session_pool", + "name": "reset_store", + "parsedDocstring": { + "text": "Reset the KVS where the pool state is persisted." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 225 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the KVS where the pool state is persisted." + } + ] + }, + "flags": {}, + "id": 2533, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset_store", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A pool of sessions that are managed, rotated, and persisted based on usage and age.\n\nIt ensures effective session management by maintaining a pool of sessions and rotating them based on\nusage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their\nlifecycle, and optionally persist the state to enable recovery." + } + ] + }, + "decorations": [ + { + "args": "('Session management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2514, + 2516, + 2499, + 2508, + 2524, + 2527, + 2529, + 2521, + 2532 + ], + "title": "Methods" + }, + { + "children": [ + 2513, + 2512, + 2510, + 2511 + ], + "title": "Properties" + } + ], + "id": 2498, + "module": "sessions._session_pool", + "name": "SessionPool", + "parsedDocstring": { + "text": "A pool of sessions that are managed, rotated, and persisted based on usage and age.\n\nIt ensures effective session management by maintaining a pool of sessions and rotating them based on\nusage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their\nlifecycle, and optionally persist the state to enable recovery." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_session_pool.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie name." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2539, + "module": "sessions._cookies", + "name": "name", + "parsedDocstring": { + "text": "Cookie name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie value." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2540, + "module": "sessions._cookies", + "name": "value", + "parsedDocstring": { + "text": "Cookie value." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Required", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Domain for which the cookie is set." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2541, + "module": "sessions._cookies", + "name": "domain", + "parsedDocstring": { + "text": "Domain for which the cookie is set." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Path on the specified domain for which the cookie is set." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2542, + "module": "sessions._cookies", + "name": "path", + "parsedDocstring": { + "text": "Path on the specified domain for which the cookie is set." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `Secure` flag for the cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2543, + "module": "sessions._cookies", + "name": "secure", + "parsedDocstring": { + "text": "Set the `Secure` flag for the cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `HttpOnly` flag for the cookie." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2544, + "module": "sessions._cookies", + "name": "http_only", + "parsedDocstring": { + "text": "Set the `HttpOnly` flag for the cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Expiration date for the cookie, None for a session cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2545, + "module": "sessions._cookies", + "name": "expires", + "parsedDocstring": { + "text": "Expiration date for the cookie, None for a session cookie." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set the `SameSite` attribute for the cookie." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2546, + "module": "sessions._cookies", + "name": "same_site", + "parsedDocstring": { + "text": "Set the `SameSite` attribute for the cookie." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Dictionary representation of cookies for `SessionCookies.set` method." + } + ] + }, + "decorations": [ + { + "args": "('Session management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2541, + 2545, + 2544, + 2539, + 2542, + 2546, + 2543, + 2540 + ], + "title": "Properties" + } + ], + "id": 2538, + "module": "sessions._cookies", + "name": "CookieParam", + "parsedDocstring": { + "text": "Dictionary representation of cookies for `SessionCookies.set` method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2548, + "module": "sessions._cookies", + "name": "name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2549, + "module": "sessions._cookies", + "name": "value", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2550, + "module": "sessions._cookies", + "name": "domain", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": 
"reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2551, + "module": "sessions._cookies", + "name": "path", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2552, + "module": "sessions._cookies", + "name": "secure", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2553, + "module": "sessions._cookies", + "name": "httpOnly", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", 
+ "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2554, + "module": "sessions._cookies", + "name": "expires", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "float" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2555, + "module": "sessions._cookies", + "name": "sameSite", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2556, + "module": "sessions._cookies", + "name": "partitionKey", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + 
"text": "Cookie parameters in Playwright format with camelCase naming." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2550, + 2554, + 2553, + 2548, + 2556, + 2551, + 2555, + 2552, + 2549 + ], + "title": "Properties" + } + ], + "id": 2547, + "module": "sessions._cookies", + "name": "PlaywrightCookieParam", + "parsedDocstring": { + "text": "Cookie parameters in Playwright format with camelCase naming." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2558, + "module": "sessions._cookies", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 63 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2559, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2560, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "SessionCookies", + "target": "2557" + }, + { + "type": "reference", + "name": "CookieJar" + } + ] + }, + { + "type": "reference", 
+ "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + }, + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The cookie jar instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2561, + "module": "sessions._cookies", + "name": "jar", + "parsedDocstring": { + "text": "The cookie jar instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 84 + } + ], + "type": { + "name": "CookieJar", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store a cookie with modern browser attributes.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2562, + "module": "sessions._cookies", + "name": "set", + "parsedDocstring": { + "text": "Create and store a cookie with modern browser attributes.\n", + "args": { + "name": "Cookie name.", + "value": "Cookie value.", + "domain": "Cookie domain.", + "path": "Cookie path.", + "expires": "Cookie expiration timestamp.", + "http_only": "Whether cookie is HTTP-only.", + "secure": "Whether cookie requires secure context.", + "same_site": "SameSite cookie attribute value." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 88 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store a cookie with modern browser attributes.\n" + } + ] + }, + "flags": {}, + "id": 2563, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie name." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2564, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie value." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2565, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie domain." + } + ] + }, + "defaultValue": "''", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2566, + "kind": 32768, + "kindString": "Parameter", + "name": "domain", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie path." + } + ] + }, + "defaultValue": "'/'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2567, + "kind": 32768, + "kindString": "Parameter", + "name": "path", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cookie expiration timestamp." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2568, + "kind": 32768, + "kindString": "Parameter", + "name": "expires", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether cookie is HTTP-only." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2569, + "kind": 32768, + "kindString": "Parameter", + "name": "http_only", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether cookie requires secure context." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2570, + "kind": 32768, + "kindString": "Parameter", + "name": "secure", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "SameSite cookie attribute value." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2571, + "kind": 32768, + "kindString": "Parameter", + "name": "same_site", + "type": { + "name": "Literal['Lax', 'None', 'Strict'] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "Lax" + }, + { + "type": "literal", + "value": "None" + }, + { + "type": "literal", + "value": "Strict" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2572, + "kind": 32768, + "kindString": "Parameter", + "name": "_kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert cookies to a list with `CookieParam` dicts." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2573, + "module": "sessions._cookies", + "name": "get_cookies_as_dicts", + "parsedDocstring": { + "text": "Convert cookies to a list with `CookieParam` dicts." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Convert cookies to a list with `CookieParam` dicts." 
+ } + ] + }, + "flags": {}, + "id": 2574, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_as_dicts", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store a Cookie object in the session cookie jar.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2575, + "module": "sessions._cookies", + "name": "store_cookie", + "parsedDocstring": { + "text": "Store a Cookie object in the session cookie jar.\n", + "args": { + "cookie": "The Cookie object to store in the jar." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store a Cookie object in the session cookie jar.\n" + } + ] + }, + "flags": {}, + "id": 2576, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_cookie", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The Cookie object to store in the jar." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2577, + "kind": 32768, + "kindString": "Parameter", + "name": "cookie", + "type": { + "name": "Cookie", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store multiple cookie objects in the session cookie jar.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2578, + "module": "sessions._cookies", + "name": "store_cookies", + "parsedDocstring": { + "text": "Store multiple cookie objects in the session cookie jar.\n", + "args": { + "cookies": "A list of cookie objects to store in the jar." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 200 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store multiple cookie objects in the session cookie jar.\n" + } + ] + }, + "flags": {}, + "id": 2579, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "store_cookies", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A list of cookie objects to store in the jar." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2580, + "kind": 32768, + "kindString": "Parameter", + "name": "cookies", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Cookie" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store cookies from their dictionary representations.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2581, + "module": "sessions._cookies", + "name": "set_cookies", + "parsedDocstring": { + "text": "Create and store cookies from their dictionary representations.\n", + "args": { + "cookie_dicts": "List of dictionaries where each dict represents cookie parameters." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 210 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create and store cookies from their dictionary representations.\n" + } + ] + }, + "flags": {}, + "id": 2582, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_cookies", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "List of dictionaries where each dict represents cookie parameters." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2583, + "kind": 32768, + "kindString": "Parameter", + "name": "cookie_dicts", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get cookies in playwright format." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2584, + "module": "sessions._cookies", + "name": "get_cookies_as_playwright_format", + "parsedDocstring": { + "text": "Get cookies in playwright format." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 220 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get cookies in playwright format." + } + ] + }, + "flags": {}, + "id": 2585, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_cookies_as_playwright_format", + "parameters": [], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "PlaywrightCookieParam", + "target": "2547" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set cookies from playwright format." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2586, + "module": "sessions._cookies", + "name": "set_cookies_from_playwright_format", + "parsedDocstring": { + "text": "Set cookies from playwright format." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set cookies from playwright format." + } + ] + }, + "flags": {}, + "id": 2587, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "set_cookies_from_playwright_format", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2588, + "kind": 32768, + "kindString": "Parameter", + "name": "pw_cookies", + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "PlaywrightCookieParam", + "target": "2547" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2589, + "module": "sessions._cookies", + "name": "__deepcopy__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 231 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2590, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__deepcopy__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2591, + "kind": 32768, + "kindString": "Parameter", + "name": "memo", + "type": { + "name": "dict[int, Any] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + 
{ + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "SessionCookies", + "type": "reference", + "target": "2557" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2592, + "module": "sessions._cookies", + "name": "__len__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 236 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2593, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__len__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2594, + "module": "sessions._cookies", + "name": "__setitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 239 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2595, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__setitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2596, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2597, + "kind": 32768, + 
"kindString": "Parameter", + "name": "value", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2598, + "module": "sessions._cookies", + "name": "__getitem__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2599, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__getitem__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2600, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2601, + "module": "sessions._cookies", + "name": "__iter__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 248 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2602, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__iter__", + "parameters": [], + 
"type": { + "name": "Iterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "CookieParam", + "target": "2538" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2603, + "module": "sessions._cookies", + "name": "__repr__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2604, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__repr__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2605, + "module": "sessions._cookies", + "name": "__bool__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 257 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2606, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__bool__", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2607, + "module": "sessions._cookies", + "name": "__eq__", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 262 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2608, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__eq__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2609, + "kind": 32768, + "kindString": "Parameter", + "name": "other", + "type": { + "name": "object", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the cookies key attributes." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2610, + "module": "sessions._cookies", + "name": "__hash__", + "parsedDocstring": { + "text": "Return hash based on the cookies key attributes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 274 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return hash based on the cookies key attributes." + } + ] + }, + "flags": {}, + "id": 2611, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__hash__", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage cookies for session with browser-compatible serialization and deserialization." 
+ } + ] + }, + "decorations": [ + { + "args": "('Session management')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2605, + 2589, + 2607, + 2598, + 2610, + 2558, + 2601, + 2592, + 2603, + 2594, + 2573, + 2584, + 2562, + 2581, + 2586, + 2575, + 2578 + ], + "title": "Methods" + }, + { + "children": [ + 2561 + ], + "title": "Properties" + } + ], + "id": 2557, + "module": "sessions._cookies", + "name": "SessionCookies", + "parsedDocstring": { + "text": "Storage cookies for session with browser-compatible serialization and deserialization." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/sessions/_cookies.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2613, + "module": "statistics._error_snapshotter", + "name": "MAX_ERROR_CHARACTERS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2614, + "module": "statistics._error_snapshotter", + "name": "MAX_HASH_LENGTH", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + 
"kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2615, + "module": "statistics._error_snapshotter", + "name": "MAX_FILENAME_LENGTH", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2616, + "module": "statistics._error_snapshotter", + "name": "BASE_MESSAGE", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2617, + "module": "statistics._error_snapshotter", + "name": "SNAPSHOT_PREFIX", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2618, + "module": "statistics._error_snapshotter", + "name": "ALLOWED_CHARACTERS", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2619, + "module": "statistics._error_snapshotter", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2620, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2621, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshot_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. 
ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2622, + "module": "statistics._error_snapshotter", + "name": "capture_snapshot", + "parsedDocstring": { + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n", + "args": { + "error_message": "Used in filename of the snapshot.", + "file_and_line": "Used in filename of the snapshot.", + "context": "Context that is used to get the snapshot." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Capture error snapshot and save it to key value store.\n\nIt saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\nit returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\nreturned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\nan exception.\n" + } + ] + }, + "flags": {}, + "id": 2623, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "capture_snapshot", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used in filename of the snapshot." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2624, + "kind": 32768, + "kindString": "Parameter", + "name": "error_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Used in filename of the snapshot." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2625, + "kind": 32768, + "kindString": "Parameter", + "name": "file_and_line", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context that is used to get the snapshot." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2626, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext", + "type": "reference", + "target": "504" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2619, + 2622 + ], + "title": "Methods" + }, + { + "children": [ + 2618, + 2616, + 2613, + 2615, + 2614, + 2617 + ], + "title": "Properties" + } + ], + "id": 2612, + "module": "statistics._error_snapshotter", + "name": "ErrorSnapshotter", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_snapshotter.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2627, + "module": "statistics._error_tracker", + "name": "GroupName", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 16 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2628, + "module": "statistics._error_tracker", + "name": "ErrorFilenameGroups", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2629, + "module": "statistics._error_tracker", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2631, + "module": "statistics._error_tracker", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2632, + "kind": 4096, + "kindString": "Call 
signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2633, + "kind": 32768, + "kindString": "Parameter", + "name": "snapshot_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2634, + "kind": 32768, + "kindString": "Parameter", + "name": "show_error_name", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2635, + "kind": 32768, + "kindString": "Parameter", + "name": "show_file_and_line_number", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "True", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2636, + "kind": 32768, + "kindString": "Parameter", + "name": "show_error_message", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2637, + "kind": 32768, + "kindString": "Parameter", + "name": "show_full_message", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2638, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an error in the statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2639, + 
"module": "statistics._error_tracker", + "name": "add", + "parsedDocstring": { + "text": "Add an error in the statistics.\n", + "args": { + "error": "Error to be added to statistics.", + "context": "Context used to collect error snapshot.", + "early": "Flag indicating that the error is added earlier than usual to have access to resources that will be\nclosed before normal error collection. This prevents double reporting during normal error collection." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add an error in the statistics.\n" + } + ] + }, + "flags": {}, + "id": 2640, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Error to be added to statistics." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2641, + "kind": 32768, + "kindString": "Parameter", + "name": "error", + "type": { + "name": "Exception", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Context used to collect error snapshot." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2642, + "kind": 32768, + "kindString": "Parameter", + "name": "context", + "type": { + "name": "BasicCrawlingContext | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BasicCrawlingContext", + "target": "504" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag indicating that the error is added earlier than usual to have access to resources that will be\nclosed before normal error collection. This prevents double reporting during normal error collection." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2643, + "kind": 32768, + "kindString": "Parameter", + "name": "early", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of distinct kinds of errors." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2644, + "module": "statistics._error_tracker", + "name": "unique_error_count", + "parsedDocstring": { + "text": "Number of distinct kinds of errors." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total number of errors." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2645, + "module": "statistics._error_tracker", + "name": "total", + "parsedDocstring": { + "text": "Total number of errors." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return n most common errors." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2646, + "module": "statistics._error_tracker", + "name": "get_most_common_errors", + "parsedDocstring": { + "text": "Return n most common errors." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 153 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return n most common errors." 
+ } + ] + }, + "flags": {}, + "id": 2647, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_most_common_errors", + "parameters": [ + { + "defaultValue": "3", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2648, + "kind": 32768, + "kindString": "Parameter", + "name": "n", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Track errors and aggregates their counts by similarity." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2631, + 2639, + 2646 + ], + "title": "Methods" + }, + { + "children": [ + 2645, + 2644 + ], + "title": "Properties" + } + ], + "id": 2630, + "module": "statistics._error_tracker", + "name": "ErrorTracker", + "parsedDocstring": { + "text": "Track errors and aggregates their counts by similarity." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_error_tracker.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2650, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2651, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2652, + "module": "statistics._models", + "name": "retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2653, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2654, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2655, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + 
"groups": [], + "id": 2656, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2657, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2658, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2659, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 2048, + 
"kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Print out the Final Statistics data as a table." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2660, + "module": "statistics._models", + "name": "to_table", + "parsedDocstring": { + "text": "Print out the Final Statistics data as a table." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Print out the Final Statistics data as a table." + } + ] + }, + "flags": {}, + "id": 2661, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_table", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2662, + "module": "statistics._models", + "name": "to_dict", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2663, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "to_dict", + "parameters": [], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "float" + }, + { + "type": "reference", + "name": "int" + } + ] + }, + { + "type": "reference", + 
"name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 2664, + "module": "statistics._models", + "name": "__str__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2665, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__str__", + "parameters": [], + "type": { + "name": "str", + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistics about a crawler run." + } + ] + }, + "decorations": [ + { + "args": "(frozen=True)", + "name": "dataclass" + }, + { + "args": "('Statistics')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2664, + 2662, + 2660 + ], + "title": "Methods" + }, + { + "children": [ + 2659, + 2653, + 2654, + 2657, + 2651, + 2656, + 2650, + 2655, + 2658, + 2652 + ], + "title": "Properties" + } + ], + "id": 2649, + "module": "statistics._models", + "name": "FinalStatistics", + "parsedDocstring": { + "text": "Statistics about a crawler run." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2667, + "module": "statistics._models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2668, + "module": "statistics._models", + "name": "stats_id", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='statsId')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2669, + "module": "statistics._models", + "name": "requests_finished", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "int", + 
"type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2670, + "module": "statistics._models", + "name": "requests_failed", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2671, + "module": "statistics._models", + "name": "requests_retries", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2672, + "module": "statistics._models", + "name": "requests_failed_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2673, + "module": "statistics._models", + "name": "requests_finished_per_minute", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "float", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2674, + "module": "statistics._models", + "name": "request_min_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "790" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2675, + "module": "statistics._models", + "name": "request_max_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta_ms", + "target": "790" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2676, + "module": "statistics._models", + "name": "request_total_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2677, + "module": "statistics._models", + "name": "request_total_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "timedelta_ms", + "type": "reference", + "target": "790" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2678, + "module": "statistics._models", + "name": "crawler_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerStartedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2679, + "module": "statistics._models", + "name": "crawler_last_started_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": 
"Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2680, + "module": "statistics._models", + "name": "crawler_finished_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Annotated[datetime | None, Field(alias='crawlerFinishedAt')]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2681, + "module": "statistics._models", + "name": "stats_persisted_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2682, + "module": "statistics._models", + "name": "request_retry_histogram", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + 
"character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2683, + "module": "statistics._models", + "name": "model_post_init", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "model_post_init", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2685, + "kind": 32768, + "kindString": "Parameter", + "name": "__context", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "StatisticsState.model_post_init", + "target": 2683, + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2686, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": 
"timedelta", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": ".setter", + "name": "crawler_runtime" + } + ], + "flags": {}, + "groups": [], + "id": 2687, + "module": "statistics._models", + "name": "crawler_runtime", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2688, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawler_runtime", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2689, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "timedelta", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime", + "target": 2686, + "type": "reference" + }, + "overwrites": { + "name": "StatisticsState.crawler_runtime", + "target": 2687, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='crawlerRuntimeMillis')", + "name": "computed_field" + } + ], + "flags": {}, + "groups": [], + "id": 2690, + "module": "statistics._models", + "name": "crawler_runtime_for_serialization", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + 
"signatures": [ + { + "flags": {}, + "id": 2691, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "crawler_runtime_for_serialization", + "parameters": [], + "type": { + "name": "timedelta", + "type": "reference" + }, + "inheritedFrom": { + "name": "StatisticsState.crawler_runtime_for_serialization", + "target": 2690, + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestTotalDurationMillis', return_type=timedelta_ms)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2692, + "module": "statistics._models", + "name": "request_total_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2693, + "module": "statistics._models", + "name": "request_avg_failed_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2694, + "module": "statistics._models", + "name": "request_avg_finished_duration", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "args": "(alias='requestsTotal')", + "name": "computed_field" + }, + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2695, + "module": "statistics._models", + "name": "requests_total", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 155 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Statistic data about a crawler run." 
+ } + ] + }, + "decorations": [ + { + "args": "('Statistics')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2687, + 2690, + 2683 + ], + "title": "Methods" + }, + { + "children": [ + 2680, + 2679, + 2686, + 2678, + 2667, + 2693, + 2694, + 2675, + 2674, + 2682, + 2692, + 2676, + 2677, + 2670, + 2672, + 2669, + 2673, + 2671, + 2695, + 2668, + 2681 + ], + "title": "Properties" + } + ], + "id": 2666, + "module": "statistics._models", + "name": "StatisticsState", + "parsedDocstring": { + "text": "Statistic data about a crawler run." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "AdaptivePlaywrightCrawlerStatisticState", + "target": "1304", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2696, + "module": "statistics._statistics", + "name": "TStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2697, + "module": "statistics._statistics", + "name": "TNewStatisticsState", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + 
"type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2698, + "module": "statistics._statistics", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2700, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2701, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as started." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2702, + "module": "statistics._statistics", + "name": "run", + "parsedDocstring": { + "text": "Mark the job as started." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as started." + } + ] + }, + "flags": {}, + "id": 2703, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "run", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as finished." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2704, + "module": "statistics._statistics", + "name": "finish", + "parsedDocstring": { + "text": "Mark the job as finished." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark the job as finished." + } + ] + }, + "flags": {}, + "id": 2705, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "finish", + "parameters": [], + "type": { + "name": "timedelta", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of times the job has been retried." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2706, + "module": "statistics._statistics", + "name": "retry_count", + "parsedDocstring": { + "text": "Number of times the job has been retried." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "int", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Tracks information about the processing of a request." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2700, + 2704, + 2702 + ], + "title": "Methods" + }, + { + "children": [ + 2706 + ], + "title": "Properties" + } + ], + "id": 2699, + "module": "statistics._statistics", + "name": "RequestProcessingRecord", + "parsedDocstring": { + "text": "Tracks information about the processing of a request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2708, + "module": "statistics._statistics", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 2709, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2710, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool | Literal['explicit_only']", + "type": "union", 
+ "types": [ + { + "type": "reference", + "name": "bool" + }, + { + "type": "reference", + "name": "Literal", + "typeArguments": [ + { + "type": "literal", + "value": "explicit_only" + } + ] + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2711, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2712, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2713, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_factory", + "type": { + "name": "Callable[[], Coroutine[None, None, KeyValueStore]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": "Coroutine", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "3700" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2714, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": 
true + }, + "id": 2715, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2716, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2717, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TStatisticsState", + "target": "1263" + } + ], + "target": "981" + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2718, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2719, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__init__", + "target": 2708, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2720, + "module": "statistics._statistics", + "name": "replace_state_model", + "parsedDocstring": { + "text": "Create near copy of the `Statistics` with replaced `state_model`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create near copy of the `Statistics` with replaced `state_model`." + } + ] + }, + "flags": {}, + "id": 2721, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "replace_state_model", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2722, + "kind": 32768, + "kindString": "Parameter", + "name": "state_model", + "type": { + "name": "type", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TNewStatisticsState", + "target": "2697" + } + ], + "target": "981" + } + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "TNewStatisticsState", + "target": "2697" + } + ], + "target": "2707" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." + } + ] + }, + "decorations": [ + { + "name": "staticmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2723, + "module": "statistics._statistics", + "name": "with_default_state", + "parsedDocstring": { + "text": "Initialize a new instance with default state model `StatisticsState`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance with default state model `StatisticsState`." + } + ] + }, + "flags": {}, + "id": 2724, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "with_default_state", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2725, + "kind": 32768, + "kindString": "Parameter", + "name": "persistence_enabled", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2726, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2727, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2728, + "kind": 32768, + "kindString": "Parameter", + "name": "persist_state_kvs_factory", + "type": { + "name": "Callable[[], Coroutine[None, None, KeyValueStore]] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Callable", + "typeArguments": [ + { + "type": "reference", + "name": "[]" + }, + { + "type": "reference", + "name": 
"Coroutine", + "typeArguments": [ + { + "type": "literal", + "value": null + }, + { + "type": "literal", + "value": null + }, + { + "type": "reference", + "name": "KeyValueStore", + "target": "3700" + } + ] + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "'Statistics'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2729, + "kind": 32768, + "kindString": "Parameter", + "name": "log_message", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2730, + "kind": 32768, + "kindString": "Parameter", + "name": "periodic_message_logger", + "type": { + "name": "Logger | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Logger" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "timedelta(minutes=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2731, + "kind": 32768, + "kindString": "Parameter", + "name": "log_interval", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "defaultValue": "'table'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2732, + "kind": 32768, + "kindString": "Parameter", + "name": "statistics_log_format", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "table" + }, + { + "type": "literal", + "value": "inline" + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2733, + "kind": 32768, + "kindString": "Parameter", + "name": "save_error_snapshots", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "Statistics", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "StatisticsState", + "target": "2666" + } + ], + "target": "2707" + } + } + ] + }, + { + 
"kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicate whether the context is active." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2734, + "module": "statistics._statistics", + "name": "active", + "parsedDocstring": { + "text": "Indicate whether the context is active." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2735, + "module": "statistics._statistics", + "name": "__aenter__", + "parsedDocstring": { + "text": "Subscribe to events and start collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Subscribe to events and start collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 2736, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "Self", + "type": "reference" + }, + "overwrites": { + "name": "Statistics.__aenter__", + "target": 2735, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "decorations": [], + "flags": {}, + 
"groups": [], + "id": 2737, + "module": "statistics._statistics", + "name": "__aexit__", + "parsedDocstring": { + "text": "Stop collecting statistics.\n" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Stop collecting statistics.\n" + } + ] + }, + "flags": {}, + "id": 2738, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2739, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2740, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2741, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Statistics.__aexit__", + "target": 2737, + "type": "reference" + } + } + ] 
+ }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2742, + "module": "statistics._statistics", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "TStatisticsState", + "type": "reference", + "target": "1263" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2743, + "module": "statistics._statistics", + "name": "register_status_code", + "parsedDocstring": { + "text": "Increment the number of times a status code has been received." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 208 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Increment the number of times a status code has been received." 
+ } + ] + }, + "flags": {}, + "id": 2744, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "register_status_code", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2745, + "kind": 32768, + "kindString": "Parameter", + "name": "code", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2746, + "module": "statistics._statistics", + "name": "record_request_processing_start", + "parsedDocstring": { + "text": "Mark a request as started." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as started." + } + ] + }, + "flags": {}, + "id": 2747, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_start", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2748, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." 
+ } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2749, + "module": "statistics._statistics", + "name": "record_request_processing_finish", + "parsedDocstring": { + "text": "Mark a request as finished." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as finished." + } + ] + }, + "flags": {}, + "id": 2750, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_finish", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2751, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." + } + ] + }, + "decorations": [ + { + "name": "ensure_context" + } + ], + "flags": {}, + "groups": [], + "id": 2752, + "module": "statistics._statistics", + "name": "record_request_processing_failure", + "parsedDocstring": { + "text": "Mark a request as failed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as failed." 
+ } + ] + }, + "flags": {}, + "id": 2753, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "record_request_processing_failure", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2754, + "kind": 32768, + "kindString": "Parameter", + "name": "request_id_or_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2755, + "module": "statistics._statistics", + "name": "calculate", + "parsedDocstring": { + "text": "Calculate the current statistics." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Calculate the current statistics." + } + ] + }, + "flags": {}, + "id": 2756, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "calculate", + "parameters": [], + "type": { + "name": "FinalStatistics", + "type": "reference", + "target": "2649" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2757, + "module": "statistics._statistics", + "name": "reset", + "parsedDocstring": { + "text": "Reset the statistics to their defaults and remove any persistent state." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 277 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reset the statistics to their defaults and remove any persistent state." + } + ] + }, + "flags": {}, + "id": 2758, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reset", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A class for collecting, tracking, and logging runtime statistics for requests.\n\nIt is designed to record information such as request durations, retries, successes, and failures, enabling\nanalysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they\nremain available across crawler migrations, abortions, and restarts. This persistence allows for tracking\nand evaluation of crawler behavior over its lifecycle." + } + ] + }, + "decorations": [ + { + "args": "('Statistics')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2735, + 2737, + 2708, + 2755, + 2752, + 2749, + 2746, + 2743, + 2720, + 2757, + 2723 + ], + "title": "Methods" + }, + { + "children": [ + 2734, + 2742 + ], + "title": "Properties" + } + ], + "id": 2707, + "module": "statistics._statistics", + "name": "Statistics", + "parsedDocstring": { + "text": "A class for collecting, tracking, and logging runtime statistics for requests.\n\nIt is designed to record information such as request durations, retries, successes, and failures, enabling\nanalysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they\nremain available across crawler migrations, abortions, and restarts. 
This persistence allows for tracking\nand evaluation of crawler behavior over its lifecycle." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/statistics/_statistics.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "_NonPersistentStatistics", + "target": "1385", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2760, + "module": "storage_clients._base._request_queue_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "flags": {}, + "id": 2761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + }, + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2762, + "module": "storage_clients._base._request_queue_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + } + ] + }, + "flags": {}, + "id": 2763, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2764, + "module": "storage_clients._base._request_queue_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "flags": {}, + "id": 2765, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2766, + "module": "storage_clients._base._request_queue_client", + "name": "add_batch_of_requests", + "parsedDocstring": { + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n", + "args": { + "requests": "The collection of requests to add to the queue.", + "forefront": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests.", + "batch_size": "The maximum number of requests to add in a single batch.", + "wait_time_between_batches": "The time to wait between adding batches of requests.", + "wait_for_all_requests_to_be_added": "If True, the method will wait until all requests are added\nto the queue before returning.", + "wait_for_all_requests_to_be_added_timeout": "The maximum time to wait for all requests to be added.\n" + }, + "returns": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "flags": {}, + "id": 2767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_batch_of_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The collection of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AddRequestsResponse", + "type": "reference", + "target": "3676" + }, + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2770, + "module": "storage_clients._base._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 66 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 2771, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2772, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2773, + "module": "storage_clients._base._request_queue_client", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 2774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2775, + "module": "storage_clients._base._request_queue_client", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + 
"returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 2776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [ + { 
+ "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2778, + "module": "storage_clients._base._request_queue_client", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 2779, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2782, + "module": "storage_clients._base._request_queue_client", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the request queue is empty.\n", + "returns": "True if the request queue is empty, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the request queue is empty, False otherwise." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 2783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for request queue resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2766, + 2762, + 2773, + 2760, + 2770, + 2782, + 2775, + 2764, + 2778 + ], + "title": "Methods" + } + ], + "id": 2759, + "module": "storage_clients._base._request_queue_client", + "name": "RequestQueueClient", + "parsedDocstring": { + "text": "An abstract class for request queue resource clients.\n\nThese clients are specific to the type of resource they manage and operate under a designated storage\nclient, like a memory storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "FileSystemRequestQueueClient", + "target": "2990", + "type": "reference" + }, + { + "name": "MemoryRequestQueueClient", + "target": "3128", + "type": "reference" + }, + { + "name": "RedisRequestQueueClient", + "target": "3304", + "type": "reference" + }, + { + "name": "SqlRequestQueueClient", + "target": "3555", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2785, + "module": "storage_clients._base._storage_client", + "name": "get_storage_client_cache_key", + "parsedDocstring": { + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. 
By default, returns a module and name of the client\nclass." + } + ] + }, + "flags": {}, + "id": 2786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client_cache_key", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2787, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Hashable", + "type": "reference" + }, + "overwrites": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2788, + "module": "storage_clients._base._storage_client", + "name": "create_dataset_client", + "parsedDocstring": { + "text": "Create a dataset client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." 
+ } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_dataset_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2790, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2793, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "2808" + }, + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2794, + "module": "storage_clients._base._storage_client", + "name": "create_kvs_client", + "parsedDocstring": { + "text": "Create a key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "flags": {}, + "id": 2795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_kvs_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2796, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": 
"Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + }, + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2800, + "module": "storage_clients._base._storage_client", + "name": "create_rq_client", + "parsedDocstring": { + "text": "Create a request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." 
+ } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_rq_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2802, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2803, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2804, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2805, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + }, + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by 
the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2806, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "flags": {}, + "id": 2807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "reference", + "name": "int" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class for storage clients.\n\nThe `StorageClient` serves as an abstract base class that defines the interface for accessing Crawlee's\nstorage types: datasets, key-value stores, and request queues. It provides methods to open clients for\neach of these storage types and handles common functionality.\n\nStorage clients implementations can be provided for various backends (file system, memory, databases,\nvarious cloud providers, etc.) to support different use cases from development to production environments.\n\nEach storage client implementation is responsible for ensuring proper initialization, data persistence\n(where applicable), and consistent access patterns across all storage types it supports." 
+ } + ] + }, + "decorations": [ + { + "args": "('Storage clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2788, + 2794, + 2800, + 2806, + 2785 + ], + "title": "Methods" + } + ], + "id": 2784, + "module": "storage_clients._base._storage_client", + "name": "StorageClient", + "parsedDocstring": { + "text": "Base class for storage clients.\n\nThe `StorageClient` serves as an abstract base class that defines the interface for accessing Crawlee's\nstorage types: datasets, key-value stores, and request queues. It provides methods to open clients for\neach of these storage types and handles common functionality.\n\nStorage clients implementations can be provided for various backends (file system, memory, databases,\nvarious cloud providers, etc.) to support different use cases from development to production environments.\n\nEach storage client implementation is responsible for ensuring proper initialization, data persistence\n(where applicable), and consistent access patterns across all storage types it supports." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "FileSystemStorageClient", + "target": "2870", + "type": "reference" + }, + { + "name": "MemoryStorageClient", + "target": "3108", + "type": "reference" + }, + { + "name": "RedisStorageClient", + "target": "3222", + "type": "reference" + }, + { + "name": "SqlStorageClient", + "target": "3591", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2809, + "module": "storage_clients._base._dataset_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + }, + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2811, + "module": "storage_clients._base._dataset_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2813, + "module": "storage_clients._base._dataset_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2814, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2815, + "module": "storage_clients._base._dataset_client", + "name": "push_data", + "parsedDocstring": { + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2817, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[Any] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2818, + "module": "storage_clients._base._dataset_client", + "name": "get_data", + "parsedDocstring": { + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2819, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2820, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2821, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2822, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2823, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2824, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2825, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" 
+ } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2826, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2827, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2828, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2829, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2830, + "kind": 32768, + "kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": 
"Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2831, + "module": "storage_clients._base._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." 
+ } + ] + }, + "flags": {}, + "id": 2832, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2833, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2834, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2835, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2836, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2837, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2838, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + 
] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2839, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2840, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2841, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for dataset storage clients.\n\nDataset clients provide an interface for accessing and manipulating dataset storage. They handle\noperations like adding and getting dataset items across different storage backends.\n\nStorage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n`RequestQueue`), and can operate with various storage systems including memory, file system,\ndatabases, and cloud storage solutions.\n\nThis abstract class defines the interface that all specific dataset clients must implement." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2811, + 2818, + 2809, + 2831, + 2813, + 2815 + ], + "title": "Methods" + } + ], + "id": 2808, + "module": "storage_clients._base._dataset_client", + "name": "DatasetClient", + "parsedDocstring": { + "text": "An abstract class for dataset storage clients.\n\nDataset clients provide an interface for accessing and manipulating dataset storage. They handle\noperations like adding and getting dataset items across different storage backends.\n\nStorage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n`RequestQueue`), and can operate with various storage systems including memory, file system,\ndatabases, and cloud storage solutions.\n\nThis abstract class defines the interface that all specific dataset clients must implement." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "FileSystemDatasetClient", + "target": "2893", + "type": "reference" + }, + { + "name": "MemoryDatasetClient", + "target": "3030", + "type": "reference" + }, + { + "name": "RedisDatasetClient", + "target": "3177", + "type": "reference" + }, + { + "name": "SqlDatasetClient", + "target": "3348", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2843, + "module": "storage_clients._base._key_value_store_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the key-value store." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "flags": {}, + "id": 2844, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2845, + "module": "storage_clients._base._key_value_store_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." 
+ } + ] + }, + "flags": {}, + "id": 2846, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2847, + "module": "storage_clients._base._key_value_store_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + } + ] + }, + "flags": {}, + "id": 2848, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2849, + "module": "storage_clients._base._key_value_store_client", + "name": "get_value", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "3655" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2852, + "module": "storage_clients._base._key_value_store_client", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 53 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + } + ] + }, + "flags": {}, + "id": 2853, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2854, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2855, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2857, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 60 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2860, + "module": "storage_clients._base._key_value_store_client", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "flags": {}, + "id": 2861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2862, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": 
"reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2864, + "module": "storage_clients._base._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." 
+ } + ] + }, + "flags": {}, + "id": 2865, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2866, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2867, + "module": "storage_clients._base._key_value_store_client", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n", + "args": { + "key": "The key to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "flags": {}, + "id": 2868, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2869, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "An abstract class for key-value store (KVS) storage clients.\n\nKey-value stores clients provide an interface for accessing and manipulating KVS storage. They handle\noperations like getting, setting, deleting KVS values across different storage backends.\n\nStorage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n`RequestQueue`), and can operate with various storage systems including memory, file system,\ndatabases, and cloud storage solutions.\n\nThis abstract class defines the interface that all specific KVS clients must implement." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2857, + 2845, + 2843, + 2864, + 2849, + 2860, + 2847, + 2867, + 2852 + ], + "title": "Methods" + } + ], + "id": 2842, + "module": "storage_clients._base._key_value_store_client", + "name": "KeyValueStoreClient", + "parsedDocstring": { + "text": "An abstract class for key-value store (KVS) storage clients.\n\nKey-value stores clients provide an interface for accessing and manipulating KVS storage. They handle\noperations like getting, setting, deleting KVS values across different storage backends.\n\nStorage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n`RequestQueue`), and can operate with various storage systems including memory, file system,\ndatabases, and cloud storage solutions.\n\nThis abstract class defines the interface that all specific KVS clients must implement." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "FileSystemKeyValueStoreClient", + "target": "2941", + "type": "reference" + }, + { + "name": "MemoryKeyValueStoreClient", + "target": "3072", + "type": "reference" + }, + { + "name": "RedisKeyValueStoreClient", + "target": "3248", + "type": "reference" + }, + { + "name": "SqlKeyValueStoreClient", + "target": "3495", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2871, + "module": "storage_clients._base._storage_client", + "name": "get_storage_client_cache_key", + "parsedDocstring": { + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + } + ] + }, + "flags": {}, + "id": 2786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client_cache_key", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2787, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Hashable", + "type": "reference" + }, + "overwrites": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2874, + "module": "storage_clients._base._storage_client", + "name": "create_dataset_client", + "parsedDocstring": { + "text": "Create a dataset client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_dataset_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2790, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2793, + "kind": 32768, + "kindString": 
"Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "2808" + }, + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2880, + "module": "storage_clients._base._storage_client", + "name": "create_kvs_client", + "parsedDocstring": { + "text": "Create a key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." 
+ } + ] + }, + "flags": {}, + "id": 2795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_kvs_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2796, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + }, + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2886, + "module": "storage_clients._base._storage_client", + "name": "create_rq_client", + "parsedDocstring": { + "text": "Create a request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 71 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_rq_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2802, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2803, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2804, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + 
"isOptional": true, + "keyword-only": true + }, + "id": 2805, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + }, + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3987, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." 
+ } + ] + }, + "flags": {}, + "id": 2807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict[int, int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "File system implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto the local file system. Each storage type is implemented with its own specific file system client\nthat stores data in a structured directory hierarchy.\n\nData is stored in JSON format in predictable file paths, making it easy to inspect and manipulate\nthe stored data outside of the Crawlee application if needed.\n\nAll data persists between program runs but is limited to access from the local machine\nwhere the files are stored.\n\nWarning: This storage client is not safe for concurrent access from multiple crawler processes.\nUse it only when running a single crawler process at a time." + } + ] + }, + "decorations": [ + { + "args": "('Storage clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 2874, + 2880, + 2886, + 3987, + 2871 + ], + "title": "Methods" + } + ], + "id": 2870, + "module": "storage_clients._file_system._storage_client", + "name": "FileSystemStorageClient", + "parsedDocstring": { + "text": "File system implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto the local file system. 
Each storage type is implemented with its own specific file system client\nthat stores data in a structured directory hierarchy.\n\nData is stored in JSON format in predictable file paths, making it easy to inspect and manipulate\nthe stored data outside of the Crawlee application if needed.\n\nAll data persists between program runs but is limited to access from the local machine\nwhere the files are stored.\n\nWarning: This storage client is not safe for concurrent access from multiple crawler processes.\nUse it only when running a single crawler process at a time." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageClient", + "target": "2784", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2892, + "module": "storage_clients._file_system._dataset_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemDatasetClient.open` class method to create a new instance." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2894, + "module": "storage_clients._file_system._dataset_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemDatasetClient.open` class method to create a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemDatasetClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 2895, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2896, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2897, + "kind": 32768, + "kindString": "Parameter", + "name": "path_to_dataset", + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2898, + "kind": 32768, + "kindString": "Parameter", + "name": "lock", + "type": { + "name": "asyncio.Lock", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2899, + "module": "storage_clients._base._dataset_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + }, + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the dataset directory." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2901, + "module": "storage_clients._file_system._dataset_client", + "name": "path_to_dataset", + "parsedDocstring": { + "text": "The full path to the dataset directory." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the dataset metadata file." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2902, + "module": "storage_clients._file_system._dataset_client", + "name": "path_to_metadata", + "parsedDocstring": { + "text": "The full path to the dataset metadata file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a file system dataset client.\n\nThis method attempts to open an existing dataset from the file system. If a dataset with the specified ID\nor name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2903, + "module": "storage_clients._file_system._dataset_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a file system dataset client.\n\nThis method attempts to open an existing dataset from the file system. If a dataset with the specified ID\nor name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one\nis created.\n", + "args": { + "id": "The ID of the dataset to open. 
If provided, searches for existing dataset by ID.", + "name": "The name of the dataset for named (global scope) storages.", + "alias": "The alias of the dataset for unnamed (run scope) storages.", + "configuration": "The configuration object containing storage directory settings.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a file system dataset client.\n\nThis method attempts to open an existing dataset from the file system. If a dataset with the specified ID\nor name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 2904, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the dataset to open. If provided, searches for existing dataset by ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2905, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the dataset for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2906, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the dataset for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2907, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration object containing storage directory settings.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2908, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2909, + "module": "storage_clients._base._dataset_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 205 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2911, + "module": "storage_clients._base._dataset_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2814, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2913, + "module": "storage_clients._base._dataset_client", + "name": "push_data", + "parsedDocstring": { + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 223 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2817, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[Any] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2916, + "module": "storage_clients._base._dataset_client", + "name": "get_data", + "parsedDocstring": { + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 242 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "flags": {}, + "id": 2819, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2820, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2821, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2822, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2823, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2824, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { 
+ "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2825, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2826, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2827, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2828, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2829, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2830, + "kind": 32768, + 
"kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2929, + "module": "storage_clients._base._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 341 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." 
+ } + ] + }, + "flags": {}, + "id": 2832, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2833, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2834, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2835, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2836, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2837, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2838, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + 
] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2839, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2840, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2841, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "File system implementation of the dataset client.\n\nThis client persists dataset items to the file system as individual JSON files within a structured\ndirectory hierarchy following the pattern:\n\n```\n{STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json\n```\n\n\nEach item is stored as a separate file, which allows for durability and the ability to\nrecover after process termination. 
Dataset operations like filtering, sorting, and pagination are\nimplemented by processing the stored files according to the requested parameters.\n\nThis implementation is ideal for long-running crawlers where data persistence is important,\nand for development environments where you want to easily inspect the collected data between runs." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2894, + 2909, + 2916, + 2899, + 2929, + 2903, + 2911, + 2913 + ], + "title": "Methods" + }, + { + "children": [ + 2901, + 2902 + ], + "title": "Properties" + } + ], + "id": 2893, + "module": "storage_clients._file_system._dataset_client", + "name": "FileSystemDatasetClient", + "parsedDocstring": { + "text": "File system implementation of the dataset client.\n\nThis client persists dataset items to the file system as individual JSON files within a structured\ndirectory hierarchy following the pattern:\n\n```\n{STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json\n```\n\n\nEach item is stored as a separate file, which allows for durability and the ability to\nrecover after process termination. Dataset operations like filtering, sorting, and pagination are\nimplemented by processing the stored files according to the requested parameters.\n\nThis implementation is ideal for long-running crawlers where data persistence is important,\nand for development environments where you want to easily inspect the collected data between runs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "DatasetClient", + "target": "2808", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2940, + "module": "storage_clients._file_system._key_value_store_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2942, + "module": "storage_clients._file_system._key_value_store_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 2943, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2944, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2945, + "kind": 32768, + "kindString": "Parameter", + "name": "path_to_kvs", + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2946, + "kind": 32768, + "kindString": "Parameter", + "name": "lock", + "type": { + "name": "asyncio.Lock", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2947, + "module": "storage_clients._base._key_value_store_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the key-value store." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 76 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "flags": {}, + "id": 2844, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the key-value store directory." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2949, + "module": "storage_clients._file_system._key_value_store_client", + "name": "path_to_kvs", + "parsedDocstring": { + "text": "The full path to the key-value store directory." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 80 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the key-value store metadata file." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2950, + "module": "storage_clients._file_system._key_value_store_client", + "name": "path_to_metadata", + "parsedDocstring": { + "text": "The full path to the key-value store metadata file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a file system key-value store client.\n\nThis method attempts to open an existing key-value store from the file system. If a KVS with the specified\nID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2951, + "module": "storage_clients._file_system._key_value_store_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a file system key-value store client.\n\nThis method attempts to open an existing key-value store from the file system. If a KVS with the specified\nID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one\nis created.\n", + "args": { + "id": "The ID of the key-value store to open. If provided, searches for existing store by ID.", + "name": "The name of the key-value store for named (global scope) storages.", + "alias": "The alias of the key-value store for unnamed (run scope) storages.", + "configuration": "The configuration object containing storage directory settings.\n" + }, + "returns": "An instance for the opened or created storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a file system key-value store client.\n\nThis method attempts to open an existing key-value store from the file system. If a KVS with the specified\nID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 2952, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the key-value store to open. If provided, searches for existing store by ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2953, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the key-value store for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2954, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the key-value store for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2955, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration object containing storage directory settings.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2956, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2957, + "module": "storage_clients._base._key_value_store_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "flags": {}, + "id": 2846, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2959, + "module": "storage_clients._base._key_value_store_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2848, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2961, + "module": "storage_clients._base._key_value_store_client", + "name": "get_value", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "3655" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2964, + "module": "storage_clients._base._key_value_store_client", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 303 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2853, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2854, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2855, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2969, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 344 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2972, + "module": "storage_clients._base._key_value_store_client", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 367 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "flags": {}, + "id": 2861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2862, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a file:// URL for the given key.\n" + } + ] + }, + "decorations": [ + { + "name": 
"override" + } + ], + "flags": {}, + "groups": [], + "id": 2976, + "module": "storage_clients._file_system._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Return a file:// URL for the given key.\n", + "args": { + "key": "The key to get the public URL for.\n" + }, + "returns": "A file:// URL pointing to the file on the local filesystem." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 428 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A file:// URL pointing to the file on the local filesystem." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return a file:// URL for the given key.\n" + } + ] + }, + "flags": {}, + "id": 2977, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to get the public URL for.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2978, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + 
"flags": {}, + "groups": [], + "id": 2979, + "module": "storage_clients._file_system._key_value_store_client", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a record with the given key exists in the key-value store.\n", + "args": { + "key": "The key to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 442 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 2980, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2981, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "File system implementation of the key-value store client.\n\nThis client persists data to the file system, making it suitable for scenarios where data needs to\nsurvive 
process restarts. Keys are mapped to file paths in a directory structure following the pattern:\n\n```\n{STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}\n```\n\n\nBinary data is stored as-is, while JSON and text data are stored in human-readable format.\nThe implementation automatically handles serialization based on the content type and\nmaintains metadata about each record.\n\nThis implementation is ideal for long-running crawlers where persistence is important and\nfor development environments where you want to easily inspect the stored data between runs." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2942, + 2969, + 2957, + 2947, + 2976, + 2961, + 2972, + 2951, + 2959, + 2979, + 2964 + ], + "title": "Methods" + }, + { + "children": [ + 2949, + 2950 + ], + "title": "Properties" + } + ], + "id": 2941, + "module": "storage_clients._file_system._key_value_store_client", + "name": "FileSystemKeyValueStoreClient", + "parsedDocstring": { + "text": "File system implementation of the key-value store client.\n\nThis client persists data to the file system, making it suitable for scenarios where data needs to\nsurvive process restarts. Keys are mapped to file paths in a directory structure following the pattern:\n\n```\n{STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}\n```\n\n\nBinary data is stored as-is, while JSON and text data are stored in human-readable format.\nThe implementation automatically handles serialization based on the content type and\nmaintains metadata about each record.\n\nThis implementation is ideal for long-running crawlers where persistence is important and\nfor development environments where you want to easily inspect the stored data between runs." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "KeyValueStoreClient", + "target": "2842", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 2982, + "module": "storage_clients._file_system._request_queue_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Counter for regular request ordering." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2984, + "module": "storage_clients._file_system._request_queue_client", + "name": "sequence_counter", + "parsedDocstring": { + "text": "Counter for regular request ordering." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 43 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Counter for forefront request ordering." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 2985, + "module": "storage_clients._file_system._request_queue_client", + "name": "forefront_sequence_counter", + "parsedDocstring": { + "text": "Counter for forefront request ordering." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mapping of forefront request unique keys to their sequence numbers." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2986, + "module": "storage_clients._file_system._request_queue_client", + "name": "forefront_requests", + "parsedDocstring": { + "text": "Mapping of forefront request unique keys to their sequence numbers." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mapping of regular request unique keys to their sequence numbers." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2987, + "module": "storage_clients._file_system._request_queue_client", + "name": "regular_requests", + "parsedDocstring": { + "text": "Mapping of regular request unique keys to their sequence numbers." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set of request unique keys currently being processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2988, + "module": "storage_clients._file_system._request_queue_client", + "name": "in_progress_requests", + "parsedDocstring": { + "text": "Set of request unique keys currently being processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set of request unique keys that have been handled." + } + ] + }, + "flags": {}, + "groups": [], + "id": 2989, + "module": "storage_clients._file_system._request_queue_client", + "name": "handled_requests", + "parsedDocstring": { + "text": "Set of request unique keys that have been handled." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "set", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ], + "target": "2562" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "State model for the `FileSystemRequestQueueClient`." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2986, + 2985, + 2989, + 2988, + 2987, + 2984 + ], + "title": "Properties" + } + ], + "id": 2983, + "module": "storage_clients._file_system._request_queue_client", + "name": "RequestQueueState", + "parsedDocstring": { + "text": "State model for the `FileSystemRequestQueueClient`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 2991, + "module": "storage_clients._file_system._request_queue_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemRequestQueueClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 91 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `FileSystemRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 2992, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2993, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2994, + "kind": 32768, + "kindString": "Parameter", + "name": "path_to_rq", + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2995, + "kind": 32768, + "kindString": "Parameter", + "name": "lock", + "type": { + "name": "asyncio.Lock", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2996, + "kind": 32768, + "kindString": "Parameter", + "name": "recoverable_state", + "type": { + "name": "RecoverableState", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestQueueState", + "target": "2983" + } + ], + "target": "797" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 2997, + "module": "storage_clients._base._request_queue_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "flags": {}, + "id": 2761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + }, + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the request queue directory." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 2999, + "module": "storage_clients._file_system._request_queue_client", + "name": "path_to_rq", + "parsedDocstring": { + "text": "The full path to the request queue directory." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The full path to the request queue metadata file." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3000, + "module": "storage_clients._file_system._request_queue_client", + "name": "path_to_metadata", + "parsedDocstring": { + "text": "The full path to the request queue metadata file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 133 + } + ], + "type": { + "name": "Path", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a file system request queue client.\n\nThis method attempts to open an existing request queue from the file system. If a queue with the specified\nID or name exists, it loads the metadata and state from the stored files. If no existing queue is found,\na new one is created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3001, + "module": "storage_clients._file_system._request_queue_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a file system request queue client.\n\nThis method attempts to open an existing request queue from the file system. If a queue with the specified\nID or name exists, it loads the metadata and state from the stored files. 
If no existing queue is found,\na new one is created.\n", + "args": { + "id": "The ID of the request queue to open. If provided, searches for existing queue by ID.", + "name": "The name of the request queue for named (global scope) storages.", + "alias": "The alias of the request queue for unnamed (run scope) storages.", + "configuration": "The configuration object containing storage directory settings.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a file system request queue client.\n\nThis method attempts to open an existing request queue from the file system. If a queue with the specified\nID or name exists, it loads the metadata and state from the stored files. If no existing queue is found,\na new one is created.\n" + } + ] + }, + "flags": {}, + "id": 3002, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the request queue to open. If provided, searches for existing queue by ID." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3003, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the request queue for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3004, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the request queue for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3005, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The configuration object containing storage directory settings.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3006, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3007, + "module": "storage_clients._base._request_queue_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 283 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + } + ] + }, + "flags": {}, + "id": 2763, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3009, + "module": "storage_clients._base._request_queue_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 299 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "flags": {}, + "id": 2765, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3011, + "module": "storage_clients._base._request_queue_client", + "name": "add_batch_of_requests", + "parsedDocstring": { + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n", + "args": { + "requests": "The collection of requests to add to the queue.", + "forefront": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests.", + "batch_size": "The maximum number of requests to add in a single batch.", + "wait_time_between_batches": "The time to wait between adding batches of requests.", + "wait_for_all_requests_to_be_added": "If True, the method will wait until all requests are added\nto the queue before returning.", + "wait_for_all_requests_to_be_added_timeout": "The maximum time to wait for all requests to be added.\n" + }, + "returns": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 323 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "flags": {}, + "id": 2767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_batch_of_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The collection of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AddRequestsResponse", + "type": "reference", + "target": "3676" + }, + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3015, + "module": "storage_clients._base._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 448 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 2771, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2772, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3018, + "module": "storage_clients._base._request_queue_client", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 461 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 2774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3020, + "module": "storage_clients._base._request_queue_client", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the 
`RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 484 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 2776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3023, + "module": "storage_clients._base._request_queue_client", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 527 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 2779, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3027, + "module": "storage_clients._base._request_queue_client", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the request queue is empty.\n", + "returns": "True if the request queue is empty, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 588 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the request queue is empty, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 2783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A file system implementation of the request queue client.\n\nThis client persists requests to the file system as individual JSON files, making it suitable for scenarios\nwhere data needs to survive process restarts. Each request is stored as a separate file in a directory\nstructure following the pattern:\n\n```\n{STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json\n```\n\n\nThe implementation uses `RecoverableState` to maintain ordering information, in-progress status, and\nrequest handling status. This allows for proper state recovery across process restarts without\nembedding metadata in individual request files. File system storage provides durability at the cost of\nslower I/O operations compared to memory only-based storage.\n\nThis implementation is ideal for long-running crawlers where persistence is important and for situations\nwhere you need to resume crawling after process termination." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 2991, + 3011, + 3007, + 3018, + 2997, + 3015, + 3027, + 3020, + 3001, + 3009, + 3023 + ], + "title": "Methods" + }, + { + "children": [ + 3000, + 2999 + ], + "title": "Properties" + } + ], + "id": 2990, + "module": "storage_clients._file_system._request_queue_client", + "name": "FileSystemRequestQueueClient", + "parsedDocstring": { + "text": "A file system implementation of the request queue client.\n\nThis client persists requests to the file system as individual JSON files, making it suitable for scenarios\nwhere data needs to survive process restarts. Each request is stored as a separate file in a directory\nstructure following the pattern:\n\n```\n{STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json\n```\n\n\nThe implementation uses `RecoverableState` to maintain ordering information, in-progress status, and\nrequest handling status. This allows for proper state recovery across process restarts without\nembedding metadata in individual request files. File system storage provides durability at the cost of\nslower I/O operations compared to memory only-based storage.\n\nThis implementation is ideal for long-running crawlers where persistence is important and for situations\nwhere you need to resume crawling after process termination." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestQueueClient", + "target": "2759", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3029, + "module": "storage_clients._memory._dataset_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryDatasetClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3031, + "module": "storage_clients._memory._dataset_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `MemoryDatasetClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryDatasetClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3032, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3033, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3034, + "module": "storage_clients._base._dataset_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." 
+ } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + }, + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory dataset client.\n\nThis method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory\ndatasets don't check for existing datasets with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3036, + "module": "storage_clients._memory._dataset_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new memory dataset client.\n\nThis method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory\ndatasets don't check for existing datasets with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n", + "args": { + "id": "The ID of the dataset. 
If not provided, a random ID will be generated.", + "name": "The name of the dataset for named (global scope) storages.", + "alias": "The alias of the dataset for unnamed (run scope) storages.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory dataset client.\n\nThis method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory\ndatasets don't check for existing datasets with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "flags": {}, + "id": 3037, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the dataset. If not provided, a random ID will be generated." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3038, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the dataset for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3039, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the dataset for unnamed (run scope) storages.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3040, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3041, + "module": "storage_clients._base._dataset_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 98 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." 
+ } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3043, + "module": "storage_clients._base._dataset_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 107 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2814, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3045, + "module": "storage_clients._base._dataset_client", + "name": "push_data", + "parsedDocstring": { + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 116 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2817, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[Any] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3048, + "module": "storage_clients._base._dataset_client", + "name": "get_data", + "parsedDocstring": { + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "flags": {}, + "id": 2819, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2820, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2821, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2822, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2823, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2824, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + 
"type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2825, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2826, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2827, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2828, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2829, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2830, + "kind": 32768, + 
"kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3061, + "module": "storage_clients._base._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 194 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." 
+ } + ] + }, + "flags": {}, + "id": 2832, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2833, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2834, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2835, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2836, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2837, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2838, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + 
] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2839, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2840, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2841, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory implementation of the dataset client.\n\nThis client stores dataset items in memory using Python lists and dictionaries. No data is persisted\nbetween process runs, meaning all stored data is lost when the program terminates. 
This implementation\nis primarily useful for testing, development, and short-lived crawler operations where persistent\nstorage is not required.\n\nThe memory implementation provides fast access to data but is limited by available memory and\ndoes not support data sharing across different processes. It supports all dataset operations including\nsorting, filtering, and pagination, but performs them entirely in memory." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3031, + 3041, + 3048, + 3034, + 3061, + 3036, + 3043, + 3045 + ], + "title": "Methods" + } + ], + "id": 3030, + "module": "storage_clients._memory._dataset_client", + "name": "MemoryDatasetClient", + "parsedDocstring": { + "text": "Memory implementation of the dataset client.\n\nThis client stores dataset items in memory using Python lists and dictionaries. No data is persisted\nbetween process runs, meaning all stored data is lost when the program terminates. This implementation\nis primarily useful for testing, development, and short-lived crawler operations where persistent\nstorage is not required.\n\nThe memory implementation provides fast access to data but is limited by available memory and\ndoes not support data sharing across different processes. It supports all dataset operations including\nsorting, filtering, and pagination, but performs them entirely in memory." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "DatasetClient", + "target": "2808", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3073, + "module": "storage_clients._memory._key_value_store_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance." 
+ } + ] + }, + "flags": {}, + "id": 3074, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3075, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3076, + "module": "storage_clients._base._key_value_store_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." 
+ } + ] + }, + "flags": {}, + "id": 2844, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory key-value store client.\n\nThis method creates a new in-memory key-value store instance. Unlike persistent storage implementations,\nmemory KVS don't check for existing stores with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3078, + "module": "storage_clients._memory._key_value_store_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new memory key-value store client.\n\nThis method creates a new in-memory key-value store instance. Unlike persistent storage implementations,\nmemory KVS don't check for existing stores with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n", + "args": { + "id": "The ID of the key-value store. 
If not provided, a random ID will be generated.", + "name": "The name of the key-value store for named (global scope) storages.", + "alias": "The alias of the key-value store for unnamed (run scope) storages.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory key-value store client.\n\nThis method creates a new in-memory key-value store instance. Unlike persistent storage implementations,\nmemory KVS don't check for existing stores with the same name or ID since all data exists only in memory\nand is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "flags": {}, + "id": 3079, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the key-value store. If not provided, a random ID will be generated." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3080, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the key-value store for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3081, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the key-value store for unnamed (run scope) storages.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3082, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3083, + "module": "storage_clients._base._key_value_store_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "flags": {}, + "id": 2846, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3085, + "module": "storage_clients._base._key_value_store_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2848, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3087, + "module": "storage_clients._base._key_value_store_client", + "name": "get_value", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "3655" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3090, + "module": "storage_clients._base._key_value_store_client", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2853, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2854, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2855, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3095, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3098, + "module": "storage_clients._base._key_value_store_client", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 135 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "flags": {}, + "id": 2861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2862, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` 
call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3102, + "module": "storage_clients._base._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + } + ] + }, + "flags": {}, + "id": 2865, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2866, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3105, + "module": "storage_clients._base._key_value_store_client", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a 
record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n", + "args": { + "key": "The key to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 168 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "flags": {}, + "id": 2868, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2869, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory implementation of the key-value store client.\n\nThis client stores data in memory as Python dictionaries. No data is persisted between\nprocess runs, meaning all stored data is lost when the program terminates. 
This implementation\nis primarily useful for testing, development, and short-lived crawler operations where\npersistence is not required.\n\nThe memory implementation provides fast access to data but is limited by available memory and\ndoes not support data sharing across different processes." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3073, + 3095, + 3083, + 3076, + 3102, + 3087, + 3098, + 3078, + 3085, + 3105, + 3090 + ], + "title": "Methods" + } + ], + "id": 3072, + "module": "storage_clients._memory._key_value_store_client", + "name": "MemoryKeyValueStoreClient", + "parsedDocstring": { + "text": "Memory implementation of the key-value store client.\n\nThis client stores data in memory as Python dictionaries. No data is persisted between\nprocess runs, meaning all stored data is lost when the program terminates. This implementation\nis primarily useful for testing, development, and short-lived crawler operations where\npersistence is not required.\n\nThe memory implementation provides fast access to data but is limited by available memory and\ndoes not support data sharing across different processes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "KeyValueStoreClient", + "target": "2842", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3109, + "module": "storage_clients._base._storage_client", + "name": "create_dataset_client", + "parsedDocstring": { + "text": "Create a dataset client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_dataset_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2790, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2793, + "kind": 32768, + "kindString": "Parameter", 
+ "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "2808" + }, + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3115, + "module": "storage_clients._base._storage_client", + "name": "create_kvs_client", + "parsedDocstring": { + "text": "Create a key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." 
+ } + ] + }, + "flags": {}, + "id": 2795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_kvs_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2796, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + }, + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3121, + "module": "storage_clients._base._storage_client", + "name": "create_rq_client", + "parsedDocstring": { + "text": "Create a request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_rq_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2802, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2803, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2804, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + 
"isOptional": true, + "keyword-only": true + }, + "id": 2805, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + }, + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3988, + "module": "storage_clients._base._storage_client", + "name": "get_storage_client_cache_key", + "parsedDocstring": { + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." 
+ } + ] + }, + "flags": {}, + "id": 2786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client_cache_key", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2787, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Hashable", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3989, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." 
+ } + ] + }, + "flags": {}, + "id": 2807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict[int, int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that store all data\nin memory using Python data structures (lists and dictionaries). No data is persisted between process runs,\nmeaning all stored data is lost when the program terminates.\n\nThe memory implementation provides fast access to data but is limited by available memory and does not\nsupport data sharing across different processes. All storage operations happen entirely in memory with\nno disk operations.\n\nThe memory storage client is useful for testing and development environments, or short-lived crawler\noperations where persistence is not required." + } + ] + }, + "decorations": [ + { + "args": "('Storage clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3109, + 3115, + 3121, + 3989, + 3988 + ], + "title": "Methods" + } + ], + "id": 3108, + "module": "storage_clients._memory._storage_client", + "name": "MemoryStorageClient", + "parsedDocstring": { + "text": "Memory implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that store all data\nin memory using Python data structures (lists and dictionaries). 
No data is persisted between process runs,\nmeaning all stored data is lost when the program terminates.\n\nThe memory implementation provides fast access to data but is limited by available memory and does not\nsupport data sharing across different processes. All storage operations happen entirely in memory with\nno disk operations.\n\nThe memory storage client is useful for testing and development environments, or short-lived crawler\noperations where persistence is not required." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageClient", + "target": "2784", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3127, + "module": "storage_clients._memory._request_queue_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryRequestQueueClient.open` class method to create a new instance." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3129, + "module": "storage_clients._memory._request_queue_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `MemoryRequestQueueClient.open` class method to create a new instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `MemoryRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3130, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3131, + "kind": 32768, + "kindString": "Parameter", + "name": "metadata", + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3132, + "module": "storage_clients._base._request_queue_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the request queue." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "flags": {}, + "id": 2761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + }, + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory request queue client.\n\nThis method creates a new in-memory request queue instance. Unlike persistent storage implementations,\nmemory queues don't check for existing queues with the same name or ID since all data exists only\nin memory and is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3134, + "module": "storage_clients._memory._request_queue_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new memory request queue client.\n\nThis method creates a new in-memory request queue instance. 
Unlike persistent storage implementations,\nmemory queues don't check for existing queues with the same name or ID since all data exists only\nin memory and is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n", + "args": { + "id": "The ID of the request queue. If not provided, a random ID will be generated.", + "name": "The name of the request queue for named (global scope) storages.", + "alias": "The alias of the request queue for unnamed (run scope) storages.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new memory request queue client.\n\nThis method creates a new in-memory request queue instance. Unlike persistent storage implementations,\nmemory queues don't check for existing queues with the same name or ID since all data exists only\nin memory and is lost when the process terminates.\n\nAlias does not have any effect on the memory storage client implementation, because unnamed storages\nare supported by default, since data are not persisted.\n" + } + ] + }, + "flags": {}, + "id": 3135, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the request queue. If not provided, a random ID will be generated." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3136, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the request queue for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3137, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the request queue for unnamed (run scope) storages.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3138, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3139, + "module": "storage_clients._base._request_queue_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + } + ] + }, + "flags": {}, + "id": 2763, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3141, + "module": "storage_clients._base._request_queue_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2765, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3143, + "module": "storage_clients._base._request_queue_client", + "name": "add_batch_of_requests", + "parsedDocstring": { + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n", + "args": { + "requests": "The collection of requests to add to the queue.", + "forefront": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests.", + "batch_size": "The maximum number of requests to add in a single batch.", + "wait_time_between_batches": "The time to wait between adding batches of requests.", + "wait_for_all_requests_to_be_added": "If True, the method will wait until all requests are added\nto the queue before returning.", + "wait_for_all_requests_to_be_added_timeout": "The maximum time to wait for all requests to be added.\n" + }, + "returns": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "flags": {}, + "id": 2767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_batch_of_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The collection of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AddRequestsResponse", + "type": "reference", + "target": "3676" + }, + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. 
To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3147, + "module": "storage_clients._base._request_queue_client", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 232 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 2774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3149, + "module": "storage_clients._base._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 2771, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2772, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3152, + "module": "storage_clients._base._request_queue_client", + "name": "mark_request_as_handled", + "parsedDocstring": { + 
"text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 256 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 2776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 
2775, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3155, + "module": "storage_clients._base._request_queue_client", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 288 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 2779, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the queue is empty.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3159, + "module": "storage_clients._memory._request_queue_client", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the queue is empty.\n", + "returns": "True if the queue is empty, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 317 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the queue is empty, False otherwise." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 3160, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Memory implementation of the request queue client.\n\nNo data is persisted between process runs, which means all requests are lost when the program terminates.\nThis implementation is primarily useful for testing, development, and short-lived crawler runs where\npersistence is not required.\n\nThis client provides fast access to request data but is limited by available memory and does not support\ndata sharing across different processes." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3129, + 3143, + 3139, + 3147, + 3132, + 3149, + 3159, + 3152, + 3134, + 3141, + 3155 + ], + "title": "Methods" + } + ], + "id": 3128, + "module": "storage_clients._memory._request_queue_client", + "name": "MemoryRequestQueueClient", + "parsedDocstring": { + "text": "Memory implementation of the request queue client.\n\nNo data is persisted between process runs, which means all requests are lost when the program terminates.\nThis implementation is primarily useful for testing, development, and short-lived crawler runs where\npersistence is not required.\n\nThis client provides fast access to request data but is limited by available memory and does not support\ndata sharing across different processes." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RequestQueueClient", + "target": "2759", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3161, + "module": "storage_clients._redis._client_mixin", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3163, + "module": "storage_clients._redis._client_mixin", + "name": "update_accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3164, + "module": "storage_clients._redis._client_mixin", + "name": "update_modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + 
"fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3163, + 3164 + ], + "title": "Properties" + } + ], + "id": 3162, + "module": "storage_clients._redis._client_mixin", + "name": "MetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3166, + "module": "storage_clients._redis._client_mixin", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3167, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3168, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + 
"keyword-only": false + }, + "id": 3169, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3170, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis client instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3171, + "module": "storage_clients._redis._client_mixin", + "name": "redis", + "parsedDocstring": { + "text": "Return the Redis client instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Redis", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis key for the metadata of this storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3172, + "module": "storage_clients._redis._client_mixin", + "name": "metadata_key", + "parsedDocstring": { + "text": "Return the Redis key for the metadata of this storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mixin class for Redis clients.\n\nThis mixin provides common Redis operations and basic methods for Redis storage clients." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3166 + ], + "title": "Methods" + }, + { + "children": [ + 3172, + 3171 + ], + "title": "Properties" + } + ], + "id": 3165, + "module": "storage_clients._redis._client_mixin", + "name": "RedisClientMixin", + "parsedDocstring": { + "text": "Mixin class for Redis clients.\n\nThis mixin provides common Redis operations and basic methods for Redis storage clients." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "RedisDatasetClient", + "target": "3177", + "type": "reference" + }, + { + "name": "RedisKeyValueStoreClient", + "target": "3248", + "type": "reference" + }, + { + "name": "RedisRequestQueueClient", + "target": "3304", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3173, + "module": "storage_clients._redis._dataset_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + 
"kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3175, + "module": "storage_clients._redis._dataset_client", + "name": "new_item_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3176, + "module": "storage_clients._redis._dataset_client", + "name": "delta_item_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating dataset metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3176, + 3175 + ], + "title": "Properties" + } + ], + "id": 3174, + "module": "storage_clients._redis._dataset_client", + "name": "_DatasetMetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating dataset metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisDatasetClient.open` class method to create a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3178, + "module": "storage_clients._redis._dataset_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `RedisDatasetClient.open` class method to create a new instance.\n", + "args": { + "storage_name": "Internal storage name used for Redis keys.", + "storage_id": "Unique identifier for the dataset.", + "redis": "Redis client instance." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisDatasetClient.open` class method to create a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3179, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal storage name used for Redis keys." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3180, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the dataset." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3181, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis client instance." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3182, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis dataset client.\n\nThis method attempts to open an existing dataset from the Redis database. If a dataset with the specified\nID or name exists, it loads the metadata from the database. If no existing store is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3183, + "module": "storage_clients._redis._dataset_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new Redis dataset client.\n\nThis method attempts to open an existing dataset from the Redis database. If a dataset with the specified\nID or name exists, it loads the metadata from the database. 
If no existing store is found, a new one\nis created.\n", + "args": { + "id": "The ID of the dataset. If not provided, a random ID will be generated.", + "name": "The name of the dataset for named (global scope) storages.", + "alias": "The alias of the dataset for unnamed (run scope) storages.", + "redis": "Redis client instance.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis dataset client.\n\nThis method attempts to open an existing dataset from the Redis database. If a dataset with the specified\nID or name exists, it loads the metadata from the database. If no existing store is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 3184, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the dataset. If not provided, a random ID will be generated." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3185, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the dataset for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3186, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the dataset for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3187, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis client instance.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3188, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + } + ], + "type": { + "name": "RedisDatasetClient", + "type": "reference", + "target": "3177" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3189, + "module": "storage_clients._base._dataset_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the dataset." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + }, + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3191, + "module": "storage_clients._base._dataset_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole dataset and remove all its items.\n\nThe backend method for the `Dataset.drop` call." 
+ } + ] + }, + "flags": {}, + "id": 2812, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3193, + "module": "storage_clients._base._dataset_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 114 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the dataset.\n\nThe backend method for the `Dataset.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2814, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3195, + "module": "storage_clients._base._dataset_client", + "name": "push_data", + "parsedDocstring": { + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2817, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[Any] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3198, + "module": "storage_clients._base._dataset_client", + "name": "get_data", + "parsedDocstring": { + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "flags": {}, + "id": 2819, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2820, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2821, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2822, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2823, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2824, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + 
"type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2825, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2826, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2827, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2828, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2829, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2830, + "kind": 32768, + 
"kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over dataset items one by one.\n\nThis method yields items individually instead of loading all items at once,\nwhich is more memory efficient for large datasets." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3211, + "module": "storage_clients._redis._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over dataset items one by one.\n\nThis method yields items individually instead of loading all items at once,\nwhich is more memory efficient for large datasets." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over dataset items one by one.\n\nThis method yields items individually instead of loading all items at once,\nwhich is more memory efficient for large datasets." 
+ } + ] + }, + "flags": {}, + "id": 3212, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3213, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3214, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3215, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3216, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3217, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3218, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + 
] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3219, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3220, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3221, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis client instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3977, + "module": "storage_clients._redis._client_mixin", + "name": "redis", + "parsedDocstring": { + "text": "Return the Redis client instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Redis", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.redis", + "target": 3171, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis key for the metadata of this storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3978, + "module": "storage_clients._redis._client_mixin", + "name": "metadata_key", + "parsedDocstring": { + "text": "Return the Redis key for the metadata of this storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.metadata_key", + "target": 3172, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis implementation of the dataset client.\n\nThis client persists dataset items to Redis using JSON arrays for efficient storage and retrieval.\nItems are stored as JSON objects with automatic ordering preservation through Redis list operations.\n\nThe dataset data is stored in Redis using the following key pattern:\n- `datasets:{name}:items` - Redis JSON array containing all dataset items.\n- `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.\n\nItems must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.\nThe item ordering is preserved through Redis JSON array operations. 
All operations provide atomic consistency\nthrough Redis transactions and pipeline operations." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3178, + 3191, + 3198, + 3189, + 3211, + 3183, + 3193, + 3195 + ], + "title": "Methods" + }, + { + "children": [ + 3978, + 3977 + ], + "title": "Properties" + } + ], + "id": 3177, + "module": "storage_clients._redis._dataset_client", + "name": "RedisDatasetClient", + "parsedDocstring": { + "text": "Redis implementation of the dataset client.\n\nThis client persists dataset items to Redis using JSON arrays for efficient storage and retrieval.\nItems are stored as JSON objects with automatic ordering preservation through Redis list operations.\n\nThe dataset data is stored in Redis using the following key pattern:\n- `datasets:{name}:items` - Redis JSON array containing all dataset items.\n- `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.\n\nItems must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.\nThe item ordering is preserved through Redis JSON array operations. All operations provide atomic consistency\nthrough Redis transactions and pipeline operations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RedisClientMixin", + "target": "3165", + "type": "reference" + }, + { + "name": "DatasetClient", + "target": "2808", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the Redis storage client.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3223, + "module": "storage_clients._redis._storage_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize the Redis storage client.\n", + "args": { + "connection_string": "Redis connection string (e.g., \"redis://localhost:6379\").\nSupports standard Redis URL format with optional database selection.", + "redis": "Pre-configured Redis client instance.", + "queue_dedup_strategy": "Strategy for request queue deduplication. Options are:\n- 'default': Uses Redis sets for exact deduplication.\n- 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\nthis approach, approximately 1 in 1e-7 requests will be falsely considered duplicate.", + "queue_bloom_error_rate": "Desired false positive rate for Bloom filter deduplication. Only relevant if\n`queue_dedup_strategy` is set to 'bloom'." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the Redis storage client.\n" + } + ] + }, + "flags": {}, + "id": 3224, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis connection string (e.g., \"redis://localhost:6379\").\nSupports standard Redis URL format with optional database selection." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3225, + "kind": 32768, + "kindString": "Parameter", + "name": "connection_string", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre-configured Redis client instance." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3226, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Redis" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Strategy for request queue deduplication. Options are:\n- 'default': Uses Redis sets for exact deduplication.\n- 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\nthis approach, approximately 1 in 1e-7 requests will be falsely considered duplicate." 
+ } + ] + }, + "defaultValue": "'default'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3227, + "kind": 32768, + "kindString": "Parameter", + "name": "queue_dedup_strategy", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "default" + }, + { + "type": "literal", + "value": "bloom" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Desired false positive rate for Bloom filter deduplication. Only relevant if\n`queue_dedup_strategy` is set to 'bloom'." + } + ] + }, + "defaultValue": "1e-7", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3228, + "kind": 32768, + "kindString": "Parameter", + "name": "queue_bloom_error_rate", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3229, + "module": "storage_clients._base._storage_client", + "name": "create_dataset_client", + "parsedDocstring": { + "text": "Create a dataset client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." 
+ } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_dataset_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2790, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2793, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "2808" + }, + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3235, + "module": "storage_clients._base._storage_client", + "name": "create_kvs_client", + "parsedDocstring": { + "text": "Create a key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "flags": {}, + "id": 2795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_kvs_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2796, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + 
"isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + }, + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3241, + "module": "storage_clients._base._storage_client", + "name": "create_rq_client", + "parsedDocstring": { + "text": "Create a request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 129 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." 
+ } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_rq_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2802, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2803, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2804, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2805, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + }, + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { 
+ "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3990, + "module": "storage_clients._base._storage_client", + "name": "get_storage_client_cache_key", + "parsedDocstring": { + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." 
+ } + ] + }, + "flags": {}, + "id": 2786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client_cache_key", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2787, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Hashable", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3991, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." 
+ } + ] + }, + "flags": {}, + "id": 2807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict[int, int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for\nefficient storage and retrieval.\n\nThe client accepts either a Redis connection string or a pre-configured Redis client instance.\nExactly one of these parameters must be provided during initialization.\n\nStorage types use the following Redis data structures:\n- **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects\n- **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage\n- **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,\nand Bloom filters for request deduplication\n\n\n:::warning Warning\nThis is an experimental feature. 
The behavior and interface may change in future versions.\n:::" + } + ] + }, + "decorations": [ + { + "args": "('Storage clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3223, + 3229, + 3235, + 3241, + 3991, + 3990 + ], + "title": "Methods" + } + ], + "id": 3222, + "module": "storage_clients._redis._storage_client", + "name": "RedisStorageClient", + "parsedDocstring": { + "text": "Redis implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for\nefficient storage and retrieval.\n\nThe client accepts either a Redis connection string or a pre-configured Redis client instance.\nExactly one of these parameters must be provided during initialization.\n\nStorage types use the following Redis data structures:\n- **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects\n- **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage\n- **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,\nand Bloom filters for request deduplication\n\n\n:::warning Warning\nThis is an experimental feature. 
The behavior and interface may change in future versions.\n:::" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 19 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageClient", + "target": "2784", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3247, + "module": "storage_clients._redis._key_value_store_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3249, + "module": "storage_clients._redis._key_value_store_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `RedisKeyValueStoreClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3250, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3251, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3252, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3253, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis key-value store client.\n\nThis method attempts to open an existing key-value store from the Redis database. If a store with the specified\nID or name exists, it loads the metadata from the database. 
If no existing store is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3254, + "module": "storage_clients._redis._key_value_store_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new Redis key-value store client.\n\nThis method attempts to open an existing key-value store from the Redis database. If a store with the specified\nID or name exists, it loads the metadata from the database. If no existing store is found, a new one\nis created.\n", + "args": { + "id": "The ID of the key-value store. If not provided, a random ID will be generated.", + "name": "The name of the key-value store for named (global scope) storages.", + "alias": "The alias of the key-value store for unnamed (run scope) storages.", + "redis": "Redis client instance.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis key-value store client.\n\nThis method attempts to open an existing key-value store from the Redis database. If a store with the specified\nID or name exists, it loads the metadata from the database. 
If no existing store is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 3255, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the key-value store. If not provided, a random ID will be generated." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3256, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the key-value store for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3257, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the key-value store for unnamed (run scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3258, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis client instance.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3259, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + } + ], + "type": { + "name": "RedisKeyValueStoreClient", + "type": "reference", + "target": "3248" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3260, + "module": "storage_clients._base._key_value_store_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." 
+ } + ] + }, + "flags": {}, + "id": 2844, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3262, + "module": "storage_clients._base._key_value_store_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole key-value store and remove all its values.\n\nThe backend method for the `KeyValueStore.drop` call." 
+ } + ] + }, + "flags": {}, + "id": 2846, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3264, + "module": "storage_clients._base._key_value_store_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 112 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the key-value store.\n\nThe backend method for the `KeyValueStore.purge` call." 
+ } + ] + }, + "flags": {}, + "id": 2848, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3266, + "module": "storage_clients._base._key_value_store_client", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2853, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2854, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2855, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3271, + "module": "storage_clients._base._key_value_store_client", + "name": "get_value", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "3655" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3274, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3277, + "module": "storage_clients._base._key_value_store_client", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "flags": {}, + "id": 2861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2862, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` 
call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3281, + "module": "storage_clients._base._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + } + ] + }, + "flags": {}, + "id": 2865, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2866, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3284, + "module": "storage_clients._base._key_value_store_client", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a 
record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n", + "args": { + "key": "The key to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 255 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "flags": {}, + "id": 2868, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2869, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis client instance." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3979, + "module": "storage_clients._redis._client_mixin", + "name": "redis", + "parsedDocstring": { + "text": "Return the Redis client instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Redis", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.redis", + "target": 3171, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis key for the metadata of this storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3980, + "module": "storage_clients._redis._client_mixin", + "name": "metadata_key", + "parsedDocstring": { + "text": "Return the Redis key for the metadata of this storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.metadata_key", + "target": 3172, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis implementation of the key-value store client.\n\nThis client persists key-value data to Redis using hash data structures for efficient storage and retrieval.\nKeys are mapped to values with automatic content type detection and size tracking for metadata management.\n\nThe key-value store data is stored in Redis using the following key pattern:\n- `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).\n- `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.\n- `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.\n\nValues are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,\ntext values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles\ncontent type detection and maintains metadata about each record including size and MIME type information.\n\nAll operations are atomic through Redis hash operations and pipeline transactions. The client supports\nconcurrent access through Redis's built-in atomic operations for hash fields." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3249, + 3274, + 3262, + 3260, + 3281, + 3271, + 3277, + 3254, + 3264, + 3284, + 3266 + ], + "title": "Methods" + }, + { + "children": [ + 3980, + 3979 + ], + "title": "Properties" + } + ], + "id": 3248, + "module": "storage_clients._redis._key_value_store_client", + "name": "RedisKeyValueStoreClient", + "parsedDocstring": { + "text": "Redis implementation of the key-value store client.\n\nThis client persists key-value data to Redis using hash data structures for efficient storage and retrieval.\nKeys are mapped to values with automatic content type detection and size tracking for metadata management.\n\nThe key-value store data is stored in Redis using the following key pattern:\n- `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).\n- `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.\n- `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.\n\nValues are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,\ntext values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles\ncontent type detection and maintains metadata about each record including size and MIME type information.\n\nAll operations are atomic through Redis hash operations and pipeline transactions. The client supports\nconcurrent access through Redis's built-in atomic operations for hash fields." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RedisClientMixin", + "target": "3165", + "type": "reference" + }, + { + "name": "KeyValueStoreClient", + "target": "2842", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3287, + "module": "storage_clients._redis._utils", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 5 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Solve the problem of ambiguous typing for redis." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3288, + "module": "storage_clients._redis._utils", + "name": "await_redis_response", + "parsedDocstring": { + "text": "Solve the problem of ambiguous typing for redis." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 14 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Solve the problem of ambiguous typing for redis." 
+ } + ] + }, + "flags": {}, + "id": 3289, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "await_redis_response", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3290, + "kind": 32768, + "kindString": "Parameter", + "name": "response", + "type": { + "name": "Awaitable[T] | T", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Awaitable", + "typeArguments": [ + { + "type": "reference", + "name": "T", + "target": "299" + } + ] + }, + { + "type": "reference", + "name": "T", + "target": "299" + } + ] + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read a Lua script from a file." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3291, + "module": "storage_clients._redis._utils", + "name": "read_lua_script", + "parsedDocstring": { + "text": "Read a Lua script from a file." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 21 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Read a Lua script from a file." 
+ } + ] + }, + "flags": {}, + "id": 3292, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "read_lua_script", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3293, + "kind": 32768, + "kindString": "Parameter", + "name": "script_name", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3294, + "module": "storage_clients._redis._request_queue_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 26 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3296, + "module": "storage_clients._redis._request_queue_client", + "name": "new_handled_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3297, + "module": "storage_clients._redis._request_queue_client", + 
"name": "new_pending_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3298, + "module": "storage_clients._redis._request_queue_client", + "name": "new_total_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3299, + "module": "storage_clients._redis._request_queue_client", + "name": "delta_handled_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3300, + "module": 
"storage_clients._redis._request_queue_client", + "name": "delta_pending_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3301, + "module": "storage_clients._redis._request_queue_client", + "name": "delta_total_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3302, + "module": "storage_clients._redis._request_queue_client", + "name": "recalculate", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3303, + "module": 
"storage_clients._redis._request_queue_client", + "name": "update_had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating queue metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3299, + 3300, + 3301, + 3296, + 3297, + 3298, + 3302, + 3303 + ], + "title": "Properties" + } + ], + "id": 3295, + "module": "storage_clients._redis._request_queue_client", + "name": "_QueueMetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating queue metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3305, + "module": "storage_clients._redis._request_queue_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `RedisRequestQueueClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RedisRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3306, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3307, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_name", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3308, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3309, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + }, + { + "defaultValue": "'default'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3310, + "kind": 32768, + "kindString": "Parameter", + "name": "dedup_strategy", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "default" + }, + { + "type": "literal", + "value": "bloom" + } + ] + } + }, + { + "defaultValue": "1e-7", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3311, + "kind": 32768, + "kindString": "Parameter", + "name": "bloom_error_rate", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": 
"RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RedisClientMixin.__init__", + "target": 3166, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis request queue client.\n\nThis method attempts to open an existing request queue from the Redis database. If a queue with the specified\nID or name exists, it loads the metadata from the database. If no existing queue is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3312, + "module": "storage_clients._redis._request_queue_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a new Redis request queue client.\n\nThis method attempts to open an existing request queue from the Redis database. If a queue with the specified\nID or name exists, it loads the metadata from the database. If no existing queue is found, a new one\nis created.\n", + "args": { + "id": "The ID of the request queue. If not provided, a random ID will be generated.", + "name": "The name of the dataset for named (global scope) storages.", + "alias": "The alias of the dataset for unnamed (run scope) storages.", + "redis": "Redis client instance.", + "dedup_strategy": "Strategy for request queue deduplication. Options are:\n- 'default': Uses Redis sets for exact deduplication.\n- 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\nthis approach, there is a possibility 1e-7 that requests will be skipped in the queue.", + "bloom_error_rate": "Desired false positive rate for Bloom filter deduplication. Only relevant if\n`dedup_strategy` is set to 'bloom'.\n" + }, + "returns": "An instance for the opened or created storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a new Redis request queue client.\n\nThis method attempts to open an existing request queue from the Redis database. If a queue with the specified\nID or name exists, it loads the metadata from the database. If no existing queue is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 3313, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the request queue. If not provided, a random ID will be generated." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3314, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the dataset for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3315, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the dataset for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3316, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis client instance." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3317, + "kind": 32768, + "kindString": "Parameter", + "name": "redis", + "type": { + "name": "Redis", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Strategy for request queue deduplication. Options are:\n- 'default': Uses Redis sets for exact deduplication.\n- 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\nthis approach, there is a possibility 1e-7 that requests will be skipped in the queue." + } + ] + }, + "defaultValue": "'default'", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3318, + "kind": 32768, + "kindString": "Parameter", + "name": "dedup_strategy", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "default" + }, + { + "type": "literal", + "value": "bloom" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Desired false positive rate for Bloom filter deduplication. 
Only relevant if\n`dedup_strategy` is set to 'bloom'.\n" + } + ] + }, + "defaultValue": "1e-7", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3319, + "kind": 32768, + "kindString": "Parameter", + "name": "bloom_error_rate", + "type": { + "name": "float", + "type": "reference" + } + } + ], + "type": { + "name": "RedisRequestQueueClient", + "type": "reference", + "target": "3304" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3320, + "module": "storage_clients._base._request_queue_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 211 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "flags": {}, + "id": 2761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + }, + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3322, + "module": "storage_clients._base._request_queue_client", + "name": "drop", + "parsedDocstring": { + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 215 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the whole request queue and remove all its values.\n\nThe backend method for the `RequestQueue.drop` call." + } + ] + }, + "flags": {}, + "id": 2763, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3324, + "module": "storage_clients._base._request_queue_client", + "name": "purge", + "parsedDocstring": { + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 226 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge all items from the request queue.\n\nThe backend method for the `RequestQueue.purge` call." + } + ] + }, + "flags": {}, + "id": 2765, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3326, + "module": "storage_clients._base._request_queue_client", + "name": "add_batch_of_requests", + "parsedDocstring": { + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n", + "args": { + "requests": "The collection of requests to add to the queue.", + "forefront": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests.", + "batch_size": "The maximum number of requests to add in a single batch.", + "wait_time_between_batches": "The time to wait between adding batches of requests.", + "wait_for_all_requests_to_be_added": "If True, the method will wait until all requests are added\nto the queue before returning.", + "wait_for_all_requests_to_be_added_timeout": "The maximum time to wait for all requests to be added.\n" + }, + "returns": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 246 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "flags": {}, + "id": 2767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_batch_of_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The collection of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AddRequestsResponse", + "type": "reference", + "target": "3676" + }, + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. 
To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3330, + "module": "storage_clients._base._request_queue_client", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 353 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 2774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3332, + "module": "storage_clients._base._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 381 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 2771, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2772, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3335, + "module": "storage_clients._base._request_queue_client", + "name": "mark_request_as_handled", + "parsedDocstring": { + 
"text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 390 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 2776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, 
+ "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3338, + "module": "storage_clients._base._request_queue_client", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 428 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 2779, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the queue is empty.\n" + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3342, + "module": "storage_clients._redis._request_queue_client", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the queue is empty.\n", + "returns": "True if the queue is empty, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 473 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the queue is empty, False otherwise." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 3343, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis client instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3981, + "module": "storage_clients._redis._client_mixin", + "name": "redis", + "parsedDocstring": { + "text": "Return the Redis client instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Redis", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.redis", + "target": 3171, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the Redis key for the metadata of this storage." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3982, + "module": "storage_clients._redis._client_mixin", + "name": "metadata_key", + "parsedDocstring": { + "text": "Return the Redis key for the metadata of this storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "inheritedFrom": { + "name": "RedisClientMixin.metadata_key", + "target": 3172, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Redis implementation of the request queue client.\n\nThis client persists requests to Redis using multiple data structures for efficient queue operations,\ndeduplication, and concurrent access safety. Requests are stored with FIFO ordering and support\nboth regular and forefront (high-priority) insertion modes.\n\nThe implementation uses Bloom filters for efficient request deduplication and Redis lists for\nqueue operations. Request blocking and client coordination is handled through Redis hashes\nwith timestamp-based expiration for stale request recovery.\n\nThe request queue data is stored in Redis using the following key patterns:\n- `request_queues:{name}:queue` - Redis list for FIFO request ordering\n- `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key\n- `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed\n- `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)\n- `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`\ndedup_strategy)\n- `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)\n- `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)\n- `request_queues:{name}:metadata` - Redis JSON object containing queue metadata\n\nRequests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list\noperations. 
The implementation provides concurrent access safety through atomic Lua scripts,\nBloom filter operations, and Redis's built-in atomicity guarantees for individual operations." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3305, + 3326, + 3322, + 3330, + 3320, + 3332, + 3342, + 3335, + 3312, + 3324, + 3338 + ], + "title": "Methods" + }, + { + "children": [ + 3982, + 3981 + ], + "title": "Properties" + } + ], + "id": 3304, + "module": "storage_clients._redis._request_queue_client", + "name": "RedisRequestQueueClient", + "parsedDocstring": { + "text": "Redis implementation of the request queue client.\n\nThis client persists requests to Redis using multiple data structures for efficient queue operations,\ndeduplication, and concurrent access safety. Requests are stored with FIFO ordering and support\nboth regular and forefront (high-priority) insertion modes.\n\nThe implementation uses Bloom filters for efficient request deduplication and Redis lists for\nqueue operations. 
Request blocking and client coordination is handled through Redis hashes\nwith timestamp-based expiration for stale request recovery.\n\nThe request queue data is stored in Redis using the following key patterns:\n- `request_queues:{name}:queue` - Redis list for FIFO request ordering\n- `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key\n- `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed\n- `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)\n- `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`\ndedup_strategy)\n- `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)\n- `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)\n- `request_queues:{name}:metadata` - Redis JSON object containing queue metadata\n\nRequests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list\noperations. The implementation provides concurrent access safety through atomic Lua scripts,\nBloom filter operations, and Redis's built-in atomicity guarantees for individual operations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "RedisClientMixin", + "target": "3165", + "type": "reference" + }, + { + "name": "RequestQueueClient", + "target": "2759", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3344, + "module": "storage_clients._sql._dataset_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3346, + "module": "storage_clients._sql._dataset_client", + "name": "new_item_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3347, + "module": "storage_clients._sql._dataset_client", + "name": "delta_item_count", + 
"parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3983, + "module": "storage_clients._sql._client_mixin", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + }, + "inheritedFrom": { + "name": "MetadataUpdateParams.accessed_at", + "target": 3535, + "type": "reference" + }, + "overwrites": { + "name": "MetadataUpdateParams.accessed_at", + "target": 3535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3984, + "module": "storage_clients._sql._client_mixin", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + }, + "inheritedFrom": { + "name": "MetadataUpdateParams.modified_at", + "target": 3536, + "type": "reference" + }, + "overwrites": { + 
"name": "MetadataUpdateParams.modified_at", + "target": 3536, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating dataset metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3983, + 3347, + 3984, + 3346 + ], + "title": "Properties" + } + ], + "id": 3345, + "module": "storage_clients._sql._dataset_client", + "name": "_DatasetMetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating dataset metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MetadataUpdateParams", + "target": "3162", + "type": "reference" + }, + { + "name": "MetadataUpdateParams", + "target": "3162", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlDatasetClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3349, + "module": "storage_clients._sql._dataset_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `SqlDatasetClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 68 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlDatasetClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3350, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3351, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3352, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + } + ], + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open an existing dataset or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3353, + "module": "storage_clients._sql._dataset_client", + "name": "open", + "parsedDocstring": { + "text": "Open an existing dataset or create a new one.\n", + "args": { + "id": "The ID of the dataset to open. 
If provided, searches for existing dataset by ID.", + "name": "The name of the dataset for named (global scope) storages.", + "alias": "The alias of the dataset for unnamed (run scope) storages.", + "storage_client": "The SQL storage client instance.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open an existing dataset or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 3354, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the dataset to open. If provided, searches for existing dataset by ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3355, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the dataset for named (global scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3356, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the dataset for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3357, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The SQL storage client instance.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3358, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3359, + "module": "storage_clients._base._dataset_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the dataset." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the dataset." + } + ] + }, + "flags": {}, + "id": 2810, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata", + "type": "reference", + "target": "3639" + }, + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_metadata", + "target": 2809, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this dataset and all its items from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related items." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3361, + "module": "storage_clients._sql._dataset_client", + "name": "drop", + "parsedDocstring": { + "text": "Delete this dataset and all its items from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related items." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this dataset and all its items from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related items." 
+ } + ] + }, + "flags": {}, + "id": 3362, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.drop", + "target": 2811, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets item_count to 0 and deletes all records from dataset_records table." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3363, + "module": "storage_clients._sql._dataset_client", + "name": "purge", + "parsedDocstring": { + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets item_count to 0 and deletes all records from dataset_records table." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets item_count to 0 and deletes all records from dataset_records table." 
+ } + ] + }, + "flags": {}, + "id": 3364, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.purge", + "target": 2813, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3365, + "module": "storage_clients._base._dataset_client", + "name": "push_data", + "parsedDocstring": { + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 141 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Push data to the dataset.\n\nThe backend method for the `Dataset.push_data` call." 
+ } + ] + }, + "flags": {}, + "id": 2816, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2817, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[Any] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "Any" + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.push_data", + "target": 2815, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3368, + "module": "storage_clients._base._dataset_client", + "name": "get_data", + "parsedDocstring": { + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get data from the dataset with various filtering options.\n\nThe backend method for the `Dataset.get_data` call." + } + ] + }, + "flags": {}, + "id": 2819, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2820, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2821, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2822, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2823, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2824, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + 
"type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2825, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2826, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2827, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2828, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2829, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2830, + "kind": 32768, + 
"kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + }, + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.get_data", + "target": 2818, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3381, + "module": "storage_clients._base._dataset_client", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the dataset items with filtering options.\n\nThe backend method for the `Dataset.iterate_items` call." 
+ } + ] + }, + "flags": {}, + "id": 2832, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2833, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2834, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2835, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2836, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2837, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2838, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + 
] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2839, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2840, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2841, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + } + ], + "overwrites": { + "name": "DatasetClient.iterate_items", + "target": 2831, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." + } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 3974, + "module": "storage_clients._sql._client_mixin", + "name": "get_session", + "parsedDocstring": { + "text": "Create a new SQLAlchemy session for this storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." + } + ] + }, + "flags": {}, + "id": 3543, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3544, + "kind": 32768, + "kindString": "Parameter", + "name": "with_simple_commit", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator[AsyncSession]", + "type": "reference" + }, + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "SQL implementation of the dataset client.\n\nThis client persists dataset items to a SQL database using two tables for storage\nand retrieval. Items are stored as JSON with automatic ordering preservation.\n\nThe dataset data is stored in SQL database tables following the pattern:\n- `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)\n- `dataset_records` table: Contains individual items with JSON data and auto-increment ordering\n- `dataset_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nItems are stored as a JSON object in SQLite and as JSONB in PostgreSQL. 
These objects must be JSON-serializable.\nThe `item_id` auto-increment primary key ensures insertion order is preserved.\nAll operations are wrapped in database transactions with CASCADE deletion support." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3349, + 3361, + 3368, + 3359, + 3974, + 3381, + 3353, + 3363, + 3365 + ], + "title": "Methods" + } + ], + "id": 3348, + "module": "storage_clients._sql._dataset_client", + "name": "SqlDatasetClient", + "parsedDocstring": { + "text": "SQL implementation of the dataset client.\n\nThis client persists dataset items to a SQL database using two tables for storage\nand retrieval. Items are stored as JSON with automatic ordering preservation.\n\nThe dataset data is stored in SQL database tables following the pattern:\n- `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)\n- `dataset_records` table: Contains individual items with JSON data and auto-increment ordering\n- `dataset_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nItems are stored as a JSON object in SQLite and as JSONB in PostgreSQL. These objects must be JSON-serializable.\nThe `item_id` auto-increment primary key ensures insertion order is preserved.\nAll operations are wrapped in database transactions with CASCADE deletion support." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "SqlClientMixin", + "target": "3537", + "type": "reference" + }, + { + "name": "DatasetClient", + "target": "2808", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3393, + "module": "storage_clients._sql._db_models", + "name": "impl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 24 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3394, + "module": "storage_clients._sql._db_models", + "name": "cache_ok", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add UTC timezone to naive datetime values." 
+ } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3395, + "module": "storage_clients._sql._db_models", + "name": "process_result_value", + "parsedDocstring": { + "text": "Add UTC timezone to naive datetime values." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add UTC timezone to naive datetime values." + } + ] + }, + "flags": {}, + "id": 3396, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "process_result_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3397, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3398, + "kind": 32768, + "kindString": "Parameter", + "name": "dialect", + "type": { + "name": "Dialect", + "type": "reference" + } + } + ], + "type": { + "name": "datetime | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Custom SQLAlchemy type for timezone-aware datetime handling.\n\nEnsures all datetime values are timezone-aware by adding UTC timezone to\nnaive datetime values from databases that don't store timezone information." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3395 + ], + "title": "Methods" + }, + { + "children": [ + 3394, + 3393 + ], + "title": "Properties" + } + ], + "id": 3392, + "module": "storage_clients._sql._db_models", + "name": "AwareDateTime", + "parsedDocstring": { + "text": "Custom SQLAlchemy type for timezone-aware datetime handling.\n\nEnsures all datetime values are timezone-aware by adding UTC timezone to\nnaive datetime values from databases that don't store timezone information." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3400, + "module": "storage_clients._sql._db_models", + "name": "impl", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3401, + "module": "storage_clients._sql._db_models", + "name": "cache_ok", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 39 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": 
{ + "summary": [ + { + "kind": "text", + "text": "Load the appropriate dialect implementation for the JSON type." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3402, + "module": "storage_clients._sql._db_models", + "name": "load_dialect_impl", + "parsedDocstring": { + "text": "Load the appropriate dialect implementation for the JSON type." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Load the appropriate dialect implementation for the JSON type." + } + ] + }, + "flags": {}, + "id": 3403, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "load_dialect_impl", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3404, + "kind": 32768, + "kindString": "Parameter", + "name": "dialect", + "type": { + "name": "Dialect", + "type": "reference" + } + } + ], + "type": { + "name": "TypeEngine", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "JSON" + }, + { + "type": "reference", + "name": "JSONB" + } + ] + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Uses JSONB for PostgreSQL and JSON for other databases." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3402 + ], + "title": "Methods" + }, + { + "children": [ + 3401, + 3400 + ], + "title": "Properties" + } + ], + "id": 3399, + "module": "storage_clients._sql._db_models", + "name": "JsonField", + "parsedDocstring": { + "text": "Uses JSONB for PostgreSQL and JSON for other databases." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class for all database models for correct type annotations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3405, + "module": "storage_clients._sql._db_models", + "name": "Base", + "parsedDocstring": { + "text": "Base class for all database models for correct type annotations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DatasetMetadataDb", + "target": "3413", + "type": "reference" + }, + { + "name": "RequestQueueMetadataDb", + "target": "3419", + "type": "reference" + }, + { + "name": "KeyValueStoreMetadataDb", + "target": "3429", + "type": "reference" + }, + { + "name": "KeyValueStoreRecordDb", + "target": "3434", + "type": "reference" + }, + { + "name": "DatasetItemDb", + "target": "3443", + "type": "reference" + }, + { + "name": "RequestDb", + "target": "3450", + "type": "reference" + }, + { + "name": "RequestQueueStateDb", + "target": "3462", + "type": "reference" + }, + { + "name": "VersionDb", + "target": "3468", + "type": "reference" + }, + { + "name": "KeyValueStoreMetadataBufferDb", + "target": "3475", + "type": "reference" + }, + { + "name": "DatasetMetadataBufferDb", + "target": "3479", + "type": "reference" + }, + { + "name": "RequestQueueMetadataBufferDb", + "target": "3484", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": 
[], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal unique name for a storage instance based on a name or alias." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3407, + "module": "storage_clients._sql._db_models", + "name": "internal_name", + "parsedDocstring": { + "text": "Internal unique name for a storage instance based on a name or alias." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3408, + "module": "storage_clients._sql._db_models", + "name": "name", + "parsedDocstring": { + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last access datetime for usage tracking." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3409, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "Last access datetime for usage tracking." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Creation datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3410, + "module": "storage_clients._sql._db_models", + "name": "created_at", + "parsedDocstring": { + "text": "Creation datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last modification datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3411, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "Last modification datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3412, + "module": "storage_clients._sql._db_models", + "name": "buffer_locked_until", + "parsedDocstring": { + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base database model for storage metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3409, + 3412, + 3410, + 3407, + 3411, + 3408 + ], + "title": "Properties" + } + ], + "id": 3406, + "module": "storage_clients._sql._db_models", + "name": "StorageMetadataDb", + "parsedDocstring": { + "text": "Base database model for storage metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DatasetMetadataDb", + "target": "3413", + "type": "reference" + }, + { + "name": "RequestQueueMetadataDb", + "target": "3419", + "type": "reference" + }, + { + "name": "KeyValueStoreMetadataDb", + "target": "3429", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3414, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 77 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the dataset." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3415, + "module": "storage_clients._sql._db_models", + "name": "dataset_id", + "parsedDocstring": { + "text": "Unique identifier for the dataset." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of items in the dataset." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3416, + "module": "storage_clients._sql._db_models", + "name": "item_count", + "parsedDocstring": { + "text": "Number of items in the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 82 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3417, + "module": "storage_clients._sql._db_models", + "name": "items", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "DatasetItemDb", + "target": "3443" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for dataset_id to match Pydantic expectations." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3418, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Alias for dataset_id to match Pydantic expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal unique name for a storage instance based on a name or alias." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3956, + "module": "storage_clients._sql._db_models", + "name": "internal_name", + "parsedDocstring": { + "text": "Internal unique name for a storage instance based on a name or alias." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Mapped[str]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.internal_name", + "target": 3407, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3957, + "module": "storage_clients._sql._db_models", + "name": "name", + "parsedDocstring": { + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Mapped[str | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.name", + "target": 3408, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last access datetime for usage tracking." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3958, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "Last access datetime for usage tracking." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.accessed_at", + "target": 3409, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Creation datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3959, + "module": "storage_clients._sql._db_models", + "name": "created_at", + "parsedDocstring": { + "text": "Creation datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.created_at", + "target": 3410, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last modification datetime." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3960, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "Last modification datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.modified_at", + "target": 3411, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3961, + "module": "storage_clients._sql._db_models", + "name": "buffer_locked_until", + "parsedDocstring": { + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.buffer_locked_until", + "target": 3412, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Metadata table for datasets." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3414, + 3958, + 3961, + 3959, + 3415, + 3418, + 3956, + 3416, + 3417, + 3960, + 3957 + ], + "title": "Properties" + } + ], + "id": 3413, + "module": "storage_clients._sql._db_models", + "name": "DatasetMetadataDb", + "parsedDocstring": { + "text": "Metadata table for datasets." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + }, + { + "name": "StorageMetadataDb", + "target": "3406", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3420, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 97 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the request queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3421, + "module": "storage_clients._sql._db_models", + "name": "request_queue_id", + "parsedDocstring": { + "text": "Unique identifier for the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag indicating if multiple clients have accessed this queue." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3422, + "module": "storage_clients._sql._db_models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "Flag indicating if multiple clients have accessed this queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 102 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of requests processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3423, + "module": "storage_clients._sql._db_models", + "name": "handled_request_count", + "parsedDocstring": { + "text": "Number of requests processed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Number of requests waiting to be processed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3424, + "module": "storage_clients._sql._db_models", + "name": "pending_request_count", + "parsedDocstring": { + "text": "Number of requests waiting to be processed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Total number of requests ever added to this queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3425, + "module": "storage_clients._sql._db_models", + "name": "total_request_count", + "parsedDocstring": { + "text": "Total number of requests ever added to this queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 111 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3426, + "module": "storage_clients._sql._db_models", + "name": "requests", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "RequestDb", + "target": "3450" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3427, + "module": 
"storage_clients._sql._db_models", + "name": "state", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestQueueStateDb", + "target": "3462" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for request_queue_id to match Pydantic expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3428, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Alias for request_queue_id to match Pydantic expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 123 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal unique name for a storage instance based on a name or alias." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3962, + "module": "storage_clients._sql._db_models", + "name": "internal_name", + "parsedDocstring": { + "text": "Internal unique name for a storage instance based on a name or alias." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Mapped[str]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.internal_name", + "target": 3407, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3963, + "module": "storage_clients._sql._db_models", + "name": "name", + "parsedDocstring": { + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Mapped[str | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.name", + "target": 3408, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last access datetime for usage tracking." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3964, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "Last access datetime for usage tracking." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.accessed_at", + "target": 3409, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Creation datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3965, + "module": "storage_clients._sql._db_models", + "name": "created_at", + "parsedDocstring": { + "text": "Creation datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.created_at", + "target": 3410, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last modification datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3966, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "Last modification datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.modified_at", + "target": 3411, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp until which buffer processing is locked for this storage. 
NULL = unlocked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3967, + "module": "storage_clients._sql._db_models", + "name": "buffer_locked_until", + "parsedDocstring": { + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.buffer_locked_until", + "target": 3412, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Metadata table for request queues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3420, + 3964, + 3967, + 3965, + 3422, + 3423, + 3428, + 3962, + 3966, + 3963, + 3424, + 3421, + 3426, + 3427, + 3425 + ], + "title": "Properties" + } + ], + "id": 3419, + "module": "storage_clients._sql._db_models", + "name": "RequestQueueMetadataDb", + "parsedDocstring": { + "text": "Metadata table for request queues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 94 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + }, + { + "name": "StorageMetadataDb", + "target": "3406", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3430, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 130 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the key-value store." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3431, + "module": "storage_clients._sql._db_models", + "name": "key_value_store_id", + "parsedDocstring": { + "text": "Unique identifier for the key-value store." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 132 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3432, + "module": "storage_clients._sql._db_models", + "name": "records", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 136 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordDb", + "target": "3434" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for key_value_store_id to match Pydantic expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3433, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Alias for key_value_store_id to match Pydantic expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal unique name for a storage instance based on a name or alias." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3968, + "module": "storage_clients._sql._db_models", + "name": "internal_name", + "parsedDocstring": { + "text": "Internal unique name for a storage instance based on a name or alias." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Mapped[str]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.internal_name", + "target": 3407, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3969, + "module": "storage_clients._sql._db_models", + "name": "name", + "parsedDocstring": { + "text": "Human-readable name. None becomes 'default' in database to enforce uniqueness." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 58 + } + ], + "type": { + "name": "Mapped[str | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.name", + "target": 3408, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last access datetime for usage tracking." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3970, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "Last access datetime for usage tracking." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.accessed_at", + "target": 3409, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Creation datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3971, + "module": "storage_clients._sql._db_models", + "name": "created_at", + "parsedDocstring": { + "text": "Creation datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.created_at", + "target": 3410, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Last modification datetime." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3972, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "Last modification datetime." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.modified_at", + "target": 3411, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp until which buffer processing is locked for this storage. 
NULL = unlocked." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3973, + "module": "storage_clients._sql._db_models", + "name": "buffer_locked_until", + "parsedDocstring": { + "text": "Timestamp until which buffer processing is locked for this storage. NULL = unlocked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadataDb.buffer_locked_until", + "target": 3412, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Metadata table for key-value stores." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3430, + 3970, + 3973, + 3971, + 3433, + 3968, + 3431, + 3972, + 3969, + 3432 + ], + "title": "Properties" + } + ], + "id": 3429, + "module": "storage_clients._sql._db_models", + "name": "KeyValueStoreMetadataDb", + "parsedDocstring": { + "text": "Metadata table for key-value stores." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + }, + { + "name": "StorageMetadataDb", + "target": "3406", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3435, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 147 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Foreign key to metadata key-value store record." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3436, + "module": "storage_clients._sql._db_models", + "name": "key_value_store_id", + "parsedDocstring": { + "text": "Foreign key to metadata key-value store record." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key part of the key-value pair." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3437, + "module": "storage_clients._sql._db_models", + "name": "key", + "parsedDocstring": { + "text": "The key part of the key-value pair." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 158 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value stored as binary data to support any content type." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3438, + "module": "storage_clients._sql._db_models", + "name": "value", + "parsedDocstring": { + "text": "Value stored as binary data to support any content type." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bytes", + "target": "682" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "MIME type for proper value deserialization." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3439, + "module": "storage_clients._sql._db_models", + "name": "content_type", + "parsedDocstring": { + "text": "MIME type for proper value deserialization." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 164 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Size of stored value in bytes." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3440, + "module": "storage_clients._sql._db_models", + "name": "size", + "parsedDocstring": { + "text": "Size of stored value in bytes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3441, + "module": "storage_clients._sql._db_models", + "name": "kvs", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 171 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreMetadataDb", + "target": "3429" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for key_value_store_id to match SqlClientMixin expectations." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3442, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for key_value_store_id to match SqlClientMixin expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Records table for key-value stores." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3435, + 3439, + 3437, + 3436, + 3441, + 3440, + 3442, + 3438 + ], + "title": "Properties" + } + ], + "id": 3434, + "module": "storage_clients._sql._db_models", + "name": "KeyValueStoreRecordDb", + "parsedDocstring": { + "text": "Records table for key-value stores." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3444, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": 
"text", + "text": "Auto-increment primary key preserving insertion order." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3445, + "module": "storage_clients._sql._db_models", + "name": "item_id", + "parsedDocstring": { + "text": "Auto-increment primary key preserving insertion order." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 182 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Foreign key to metadata dataset record." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3446, + "module": "storage_clients._sql._db_models", + "name": "dataset_id", + "parsedDocstring": { + "text": "Foreign key to metadata dataset record." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 185 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "JSON serializable item data." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3447, + "module": "storage_clients._sql._db_models", + "name": "data", + "parsedDocstring": { + "text": "JSON serializable item data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 192 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3448, + "module": "storage_clients._sql._db_models", + "name": "dataset", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "DatasetMetadataDb", + "target": "3413" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for dataset_id to match SqlClientMixin expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3449, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for dataset_id to match SqlClientMixin expectations." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 198 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Items table for datasets." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3444, + 3447, + 3448, + 3446, + 3445, + 3449 + ], + "title": "Properties" + } + ], + "id": 3443, + "module": "storage_clients._sql._db_models", + "name": "DatasetItemDb", + "parsedDocstring": { + "text": "Items table for datasets." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 177 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3451, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 205 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3452, + "module": "storage_clients._sql._db_models", + "name": "__table_args__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": 
"/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 206 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique identifier for the request representing the unique_key." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3453, + "module": "storage_clients._sql._db_models", + "name": "request_id", + "parsedDocstring": { + "text": "Unique identifier for the request representing the unique_key." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 221 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Foreign key to metadata request queue record." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3454, + "module": "storage_clients._sql._db_models", + "name": "request_queue_id", + "parsedDocstring": { + "text": "Foreign key to metadata request queue record." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 224 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "JSON-serialized Request object." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3455, + "module": "storage_clients._sql._db_models", + "name": "data", + "parsedDocstring": { + "text": "JSON-serialized Request object." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 229 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Ordering sequence: negative for forefront, positive for regular." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3456, + "module": "storage_clients._sql._db_models", + "name": "sequence_number", + "parsedDocstring": { + "text": "Ordering sequence: negative for forefront, positive for regular." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 232 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Processing status flag." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3457, + "module": "storage_clients._sql._db_models", + "name": "is_handled", + "parsedDocstring": { + "text": "Processing status flag." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 235 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timestamp until which this request is considered blocked for processing by other clients." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3458, + "module": "storage_clients._sql._db_models", + "name": "time_blocked_until", + "parsedDocstring": { + "text": "Timestamp until which this request is considered blocked for processing by other clients." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 238 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier of the client that has currently locked this request for processing." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3459, + "module": "storage_clients._sql._db_models", + "name": "client_key", + "parsedDocstring": { + "text": "Identifier of the client that has currently locked this request for processing." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 241 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3460, + "module": "storage_clients._sql._db_models", + "name": "queue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 245 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestQueueMetadataDb", + "target": "3419" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for request_queue_id to match SqlClientMixin expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3461, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for request_queue_id to match SqlClientMixin expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests table for request queues." 
+ } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3452, + 3451, + 3459, + 3455, + 3457, + 3460, + 3453, + 3454, + 3456, + 3461, + 3458 + ], + "title": "Properties" + } + ], + "id": 3450, + "module": "storage_clients._sql._db_models", + "name": "RequestDb", + "parsedDocstring": { + "text": "Requests table for request queues." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 202 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3463, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 254 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Foreign key to metadata request queue record." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3464, + "module": "storage_clients._sql._db_models", + "name": "request_queue_id", + "parsedDocstring": { + "text": "Foreign key to metadata request queue record." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 256 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Counter for regular request ordering (positive)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3465, + "module": "storage_clients._sql._db_models", + "name": "sequence_counter", + "parsedDocstring": { + "text": "Counter for regular request ordering (positive)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 261 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Counter for forefront request ordering (negative)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3466, + "module": "storage_clients._sql._db_models", + "name": "forefront_sequence_counter", + "parsedDocstring": { + "text": "Counter for forefront request ordering (negative)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 264 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3467, + "module": "storage_clients._sql._db_models", + "name": "queue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 268 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "RequestQueueMetadataDb", + "target": "3419" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "State table for request queues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3463, + 3466, + 3467, + 3464, + 3465 + ], + "title": "Properties" + } + ], + "id": 3462, + "module": "storage_clients._sql._db_models", + "name": "RequestQueueStateDb", + "parsedDocstring": { + "text": "State table for request queues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3469, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 274 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3470, + "module": "storage_clients._sql._db_models", + "name": "version", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 276 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Table for storing the database schema version." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3469, + 3470 + ], + "title": "Properties" + } + ], + "id": 3468, + "module": "storage_clients._sql._db_models", + "name": "VersionDb", + "parsedDocstring": { + "text": "Table for storing the database schema version." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 271 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Auto-increment primary key for ordering." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3472, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Auto-increment primary key for ordering." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 282 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New accessed_at timestamp, if being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3473, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "New accessed_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 286 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New modified_at timestamp, if being updated." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3474, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "New modified_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 289 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "datetime" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base model for metadata update buffer tables." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3473, + 3472, + 3474 + ], + "title": "Properties" + } + ], + "id": 3471, + "module": "storage_clients._sql._db_models", + "name": "MetadataBufferDb", + "parsedDocstring": { + "text": "Base model for metadata update buffer tables." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 279 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "KeyValueStoreMetadataBufferDb", + "target": "3475", + "type": "reference" + }, + { + "name": "DatasetMetadataBufferDb", + "target": "3479", + "type": "reference" + }, + { + "name": "RequestQueueMetadataBufferDb", + "target": "3484", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3476, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 296 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3477, + "module": "storage_clients._sql._db_models", + "name": "key_value_store_id", + "parsedDocstring": { + "text": "ID of the key-value store being updated." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 299 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for key_value_store_id to match SqlClientMixin expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3478, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for key_value_store_id to match SqlClientMixin expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 302 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Auto-increment primary key for ordering." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3947, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Auto-increment primary key for ordering." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 282 + } + ], + "type": { + "name": "Mapped[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.id", + "target": 3472, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New accessed_at timestamp, if being updated." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3948, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "New accessed_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 286 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.accessed_at", + "target": 3473, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New modified_at timestamp, if being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3949, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "New modified_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 289 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.modified_at", + "target": 3474, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Buffer table for deferred key-value store metadata updates to reduce concurrent access issues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3476, + 3948, + 3947, + 3477, + 3949, + 3478 + ], + "title": "Properties" + } + ], + "id": 3475, + "module": "storage_clients._sql._db_models", + "name": "KeyValueStoreMetadataBufferDb", + "parsedDocstring": { + "text": "Buffer table for deferred key-value store metadata updates to reduce concurrent access issues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 293 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MetadataBufferDb", + "target": "3471", + "type": "reference" + }, + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3480, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 309 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the dataset being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3481, + "module": "storage_clients._sql._db_models", + "name": "dataset_id", + "parsedDocstring": { + "text": "ID of the dataset being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 312 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delta for dataset item_count." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3482, + "module": "storage_clients._sql._db_models", + "name": "delta_item_count", + "parsedDocstring": { + "text": "Delta for dataset item_count." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 316 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for dataset_id to match SqlClientMixin expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3483, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for dataset_id to match SqlClientMixin expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 319 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Auto-increment primary key for ordering." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3950, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Auto-increment primary key for ordering." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 282 + } + ], + "type": { + "name": "Mapped[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.id", + "target": 3472, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New accessed_at timestamp, if being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3951, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "New accessed_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 286 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.accessed_at", + "target": 3473, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New modified_at timestamp, if being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3952, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "New modified_at timestamp, if being updated." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 289 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.modified_at", + "target": 3474, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Buffer table for deferred dataset metadata updates to reduce concurrent access issues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3480, + 3951, + 3481, + 3482, + 3950, + 3952, + 3483 + ], + "title": "Properties" + } + ], + "id": 3479, + "module": "storage_clients._sql._db_models", + "name": "DatasetMetadataBufferDb", + "parsedDocstring": { + "text": "Buffer table for deferred dataset metadata updates to reduce concurrent access issues." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 306 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MetadataBufferDb", + "target": "3471", + "type": "reference" + }, + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3485, + "module": "storage_clients._sql._db_models", + "name": "__tablename__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 326 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": 
"Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3486, + "module": "storage_clients._sql._db_models", + "name": "__table_args__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 328 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the request queue being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3487, + "module": "storage_clients._sql._db_models", + "name": "request_queue_id", + "parsedDocstring": { + "text": "ID of the request queue being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 331 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Identifier of the client making this update." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3488, + "module": "storage_clients._sql._db_models", + "name": "client_id", + "parsedDocstring": { + "text": "Identifier of the client making this update." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 334 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delta for handled_request_count." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3489, + "module": "storage_clients._sql._db_models", + "name": "delta_handled_count", + "parsedDocstring": { + "text": "Delta for handled_request_count." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 338 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delta for pending_request_count." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3490, + "module": "storage_clients._sql._db_models", + "name": "delta_pending_count", + "parsedDocstring": { + "text": "Delta for pending_request_count." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 341 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delta for total_request_count." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3491, + "module": "storage_clients._sql._db_models", + "name": "delta_total_count", + "parsedDocstring": { + "text": "Delta for total_request_count." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 344 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Flag indicating that counters need recalculation from actual data." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3492, + "module": "storage_clients._sql._db_models", + "name": "need_recalc", + "parsedDocstring": { + "text": "Flag indicating that counters need recalculation from actual data." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 347 + } + ], + "type": { + "name": "Mapped", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Alias for request_queue_id to match SqlClientMixin expectations." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3493, + "module": "storage_clients._sql._db_models", + "name": "storage_id", + "parsedDocstring": { + "text": "Alias for request_queue_id to match SqlClientMixin expectations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 350 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Auto-increment primary key for ordering." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3953, + "module": "storage_clients._sql._db_models", + "name": "id", + "parsedDocstring": { + "text": "Auto-increment primary key for ordering." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 282 + } + ], + "type": { + "name": "Mapped[int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.id", + "target": 3472, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New accessed_at timestamp, if being updated." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3954, + "module": "storage_clients._sql._db_models", + "name": "accessed_at", + "parsedDocstring": { + "text": "New accessed_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 286 + } + ], + "type": { + "name": "Mapped[datetime]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.accessed_at", + "target": 3473, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "New modified_at timestamp, if being updated." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3955, + "module": "storage_clients._sql._db_models", + "name": "modified_at", + "parsedDocstring": { + "text": "New modified_at timestamp, if being updated." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 289 + } + ], + "type": { + "name": "Mapped[datetime | None]", + "type": "reference" + }, + "inheritedFrom": { + "name": "MetadataBufferDb.modified_at", + "target": 3474, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Buffer table for deferred request queue metadata updates to reduce concurrent access issues." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3486, + 3485, + 3954, + 3488, + 3489, + 3490, + 3491, + 3953, + 3955, + 3492, + 3487, + 3493 + ], + "title": "Properties" + } + ], + "id": 3484, + "module": "storage_clients._sql._db_models", + "name": "RequestQueueMetadataBufferDb", + "parsedDocstring": { + "text": "Buffer table for deferred request queue metadata updates to reduce concurrent access issues." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_db_models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 323 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MetadataBufferDb", + "target": "3471", + "type": "reference" + }, + { + "name": "Base", + "target": "3405", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3494, + "module": "storage_clients._sql._key_value_store_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3496, + "module": "storage_clients._sql._key_value_store_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `SqlKeyValueStoreClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 72 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlKeyValueStoreClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3497, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3498, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3499, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + } + ], + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open or create a SQL key-value store client.\n\nThis method attempts to open an existing key-value store from the SQL database. If a KVS with the specified\nID or name exists, it loads the metadata from the database. 
If no existing store is found, a new one\nis created.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3500, + "module": "storage_clients._sql._key_value_store_client", + "name": "open", + "parsedDocstring": { + "text": "Open or create a SQL key-value store client.\n\nThis method attempts to open an existing key-value store from the SQL database. If a KVS with the specified\nID or name exists, it loads the metadata from the database. If no existing store is found, a new one\nis created.\n", + "args": { + "id": "The ID of the key-value store to open. If provided, searches for existing store by ID.", + "name": "The name of the key-value store for named (global scope) storages.", + "alias": "The alias of the key-value store for unnamed (run scope) storages.", + "storage_client": "The SQL storage client used to access the database.\n" + }, + "returns": "An instance for the opened or created storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 85 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created storage client." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open or create a SQL key-value store client.\n\nThis method attempts to open an existing key-value store from the SQL database. If a KVS with the specified\nID or name exists, it loads the metadata from the database. 
If no existing store is found, a new one\nis created.\n" + } + ] + }, + "flags": {}, + "id": 3501, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the key-value store to open. If provided, searches for existing store by ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3502, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the key-value store for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3503, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the key-value store for unnamed (run scope) storages." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3504, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The SQL storage client used to access the database.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3505, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3506, + "module": "storage_clients._base._key_value_store_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the key-value store." 
+ } + ] + }, + "flags": {}, + "id": 2844, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "KeyValueStoreMetadata", + "type": "reference", + "target": "3642" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_metadata", + "target": 2843, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this key-value store and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3508, + "module": "storage_clients._sql._key_value_store_client", + "name": "drop", + "parsedDocstring": { + "text": "Delete this key-value store and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 126 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this key-value store and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." 
+ } + ] + }, + "flags": {}, + "id": 3509, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.drop", + "target": 2845, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this key-value store while keeping the key-value store structure.\n\nRemove all records from key_value_store_records table." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3510, + "module": "storage_clients._sql._key_value_store_client", + "name": "purge", + "parsedDocstring": { + "text": "Remove all items from this key-value store while keeping the key-value store structure.\n\nRemove all records from key_value_store_records table." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this key-value store while keeping the key-value store structure.\n\nRemove all records from key_value_store_records table." 
+ } + ] + }, + "flags": {}, + "id": 3511, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.purge", + "target": 2847, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3512, + "module": "storage_clients._base._key_value_store_client", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the key-value store by its key.\n\nThe backend method for the `KeyValueStore.set_value` call." 
+ } + ] + }, + "flags": {}, + "id": 2853, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2854, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2855, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2856, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.set_value", + "target": 2852, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3517, + "module": "storage_clients._base._key_value_store_client", + "name": "get_value", + "parsedDocstring": { + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 184 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve the given record from the key-value store.\n\nThe backend method for the `KeyValueStore.get_value` call." + } + ] + }, + "flags": {}, + "id": 2850, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2851, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "KeyValueStoreRecord | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "KeyValueStoreRecord", + "target": "3655" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_value", + "target": 2849, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3520, + "module": "storage_clients._base._key_value_store_client", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 230 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the key-value store by its key.\n\nThe backend method for the `KeyValueStore.delete_value` call." + } + ] + }, + "flags": {}, + "id": 2858, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2859, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.delete_value", + "target": 2857, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3523, + "module": "storage_clients._base._key_value_store_client", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 244 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over all the existing keys in the key-value store.\n\nThe backend method for the `KeyValueStore.iterate_keys` call." + } + ] + }, + "flags": {}, + "id": 2861, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2862, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2863, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + }, + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.iterate_keys", + "target": 2860, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the 
`KeyValueStore.record_exists` call.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3527, + "module": "storage_clients._base._key_value_store_client", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n", + "args": { + "key": "The key to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 278 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n\nThe backend method for the `KeyValueStore.record_exists` call.\n" + } + ] + }, + "flags": {}, + "id": 2868, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2869, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.record_exists", + "target": 2867, + "type": "reference" + } + }, + { + 
"kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3530, + "module": "storage_clients._base._key_value_store_client", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 291 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n\nThe backend method for the `KeyValueStore.get_public_url` call." + } + ] + }, + "flags": {}, + "id": 2865, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 2866, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + } + ], + "overwrites": { + "name": "KeyValueStoreClient.get_public_url", + "target": 2864, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." 
+ } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 3975, + "module": "storage_clients._sql._client_mixin", + "name": "get_session", + "parsedDocstring": { + "text": "Create a new SQLAlchemy session for this storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." + } + ] + }, + "flags": {}, + "id": 3543, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3544, + "kind": 32768, + "kindString": "Parameter", + "name": "with_simple_commit", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator[AsyncSession]", + "type": "reference" + }, + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "SQL implementation of the key-value store client.\n\nThis client persists key-value data to a SQL database with transaction support and\nconcurrent access safety. 
Keys are mapped to rows in database tables with proper indexing\nfor efficient retrieval.\n\nThe key-value store data is stored in SQL database tables following the pattern:\n- `key_value_stores` table: Contains store metadata (id, name, timestamps)\n- `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,\nand size information\n- `key_value_store_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nValues are serialized based on their type: JSON objects are stored as formatted JSON,\ntext values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.\nThe implementation automatically handles content type detection and maintains metadata\nabout each record including size and MIME type information.\n\nAll database operations are wrapped in transactions with proper error handling and rollback\nmechanisms. The client supports atomic upsert operations and handles race conditions when\nmultiple clients access the same store using composite primary keys (key_value_store_id, key)." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3496, + 3520, + 3508, + 3506, + 3530, + 3975, + 3517, + 3523, + 3500, + 3510, + 3527, + 3512 + ], + "title": "Methods" + } + ], + "id": 3495, + "module": "storage_clients._sql._key_value_store_client", + "name": "SqlKeyValueStoreClient", + "parsedDocstring": { + "text": "SQL implementation of the key-value store client.\n\nThis client persists key-value data to a SQL database with transaction support and\nconcurrent access safety. 
Keys are mapped to rows in database tables with proper indexing\nfor efficient retrieval.\n\nThe key-value store data is stored in SQL database tables following the pattern:\n- `key_value_stores` table: Contains store metadata (id, name, timestamps)\n- `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,\nand size information\n- `key_value_store_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nValues are serialized based on their type: JSON objects are stored as formatted JSON,\ntext values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.\nThe implementation automatically handles content type detection and maintains metadata\nabout each record including size and MIME type information.\n\nAll database operations are wrapped in transactions with proper error handling and rollback\nmechanisms. The client supports atomic upsert operations and handles race conditions when\nmultiple clients access the same store using composite primary keys (key_value_store_id, key)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "SqlClientMixin", + "target": "3537", + "type": "reference" + }, + { + "name": "KeyValueStoreClient", + "target": "2842", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3533, + "module": "storage_clients._sql._client_mixin", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3535, + "module": "storage_clients._sql._client_mixin", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3536, + "module": "storage_clients._sql._client_mixin", + "name": "modified_at", + "parsedDocstring": { + 
"text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3535, + 3536 + ], + "title": "Properties" + } + ], + "id": 3534, + "module": "storage_clients._sql._client_mixin", + "name": "MetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "_DatasetMetadataUpdateParams", + "target": "3174", + "type": "reference" + }, + { + "name": "_QueueMetadataUpdateParams", + "target": "3295", + "type": "reference" + }, + { + "name": "_DatasetMetadataUpdateParams", + "target": "3174", + "type": "reference" + }, + { + "name": "_QueueMetadataUpdateParams", + "target": "3295", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3538, + "module": "storage_clients._sql._client_mixin", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": 
"reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3539, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3540, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3541, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." + } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 3542, + "module": "storage_clients._sql._client_mixin", + "name": "get_session", + "parsedDocstring": { + "text": "Create a new SQLAlchemy session for this storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." 
+ } + ] + }, + "flags": {}, + "id": 3543, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3544, + "kind": 32768, + "kindString": "Parameter", + "name": "with_simple_commit", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "AsyncSession" + } + ] + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mixin class for SQL clients.\n\nThis mixin provides common SQL operations and basic methods for SQL storage clients." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3538, + 3542 + ], + "title": "Methods" + } + ], + "id": 3537, + "module": "storage_clients._sql._client_mixin", + "name": "SqlClientMixin", + "parsedDocstring": { + "text": "Mixin class for SQL clients.\n\nThis mixin provides common SQL operations and basic methods for SQL storage clients." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "SqlDatasetClient", + "target": "3348", + "type": "reference" + }, + { + "name": "SqlKeyValueStoreClient", + "target": "3495", + "type": "reference" + }, + { + "name": "SqlRequestQueueClient", + "target": "3555", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3545, + "module": "storage_clients._sql._request_queue_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 38 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3547, + "module": "storage_clients._sql._request_queue_client", + "name": "new_handled_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 44 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": 
[], + "id": 3548, + "module": "storage_clients._sql._request_queue_client", + "name": "new_pending_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3549, + "module": "storage_clients._sql._request_queue_client", + "name": "new_total_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 46 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3550, + "module": "storage_clients._sql._request_queue_client", + "name": "delta_handled_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + 
"id": 3551, + "module": "storage_clients._sql._request_queue_client", + "name": "delta_pending_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3552, + "module": "storage_clients._sql._request_queue_client", + "name": "delta_total_request_count", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3553, + "module": "storage_clients._sql._request_queue_client", + "name": "recalculate", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 50 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3554, + 
"module": "storage_clients._sql._request_queue_client", + "name": "update_had_multiple_clients", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 51 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3985, + "module": "storage_clients._sql._client_mixin", + "name": "accessed_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 48 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + }, + "inheritedFrom": { + "name": "MetadataUpdateParams.accessed_at", + "target": 3535, + "type": "reference" + }, + "overwrites": { + "name": "MetadataUpdateParams.accessed_at", + "target": 3535, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3986, + "module": "storage_clients._sql._client_mixin", + "name": "modified_at", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 49 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "datetime" + } + ] + }, + "inheritedFrom": { + 
"name": "MetadataUpdateParams.modified_at", + "target": 3536, + "type": "reference" + }, + "overwrites": { + "name": "MetadataUpdateParams.modified_at", + "target": 3536, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Parameters for updating queue metadata." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3985, + 3550, + 3551, + 3552, + 3986, + 3547, + 3548, + 3549, + 3553, + 3554 + ], + "title": "Properties" + } + ], + "id": 3546, + "module": "storage_clients._sql._request_queue_client", + "name": "_QueueMetadataUpdateParams", + "parsedDocstring": { + "text": "Parameters for updating queue metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 41 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "MetadataUpdateParams", + "target": "3162", + "type": "reference" + }, + { + "name": "MetadataUpdateParams", + "target": "3162", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3556, + "module": "storage_clients._sql._request_queue_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `SqlRequestQueueClient.open` class method to create a new instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `SqlRequestQueueClient.open` class method to create a new instance." + } + ] + }, + "flags": {}, + "id": 3557, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3558, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3559, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + } + ], + "overwrites": { + "name": "SqlClientMixin.__init__", + "target": 3538, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open an existing request queue or create a new one.\n\nThis method first tries to find an existing queue by ID or name.\nIf found, it returns a client for that queue. 
If not found, it creates\na new queue with the specified parameters.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3560, + "module": "storage_clients._sql._request_queue_client", + "name": "open", + "parsedDocstring": { + "text": "Open an existing request queue or create a new one.\n\nThis method first tries to find an existing queue by ID or name.\nIf found, it returns a client for that queue. If not found, it creates\na new queue with the specified parameters.\n", + "args": { + "id": "The ID of the request queue to open. Takes precedence over name.", + "name": "The name of the request queue for named (global scope) storages.", + "alias": "The alias of the request queue for unnamed (run scope) storages.", + "storage_client": "The SQL storage client used to access the database.\n" + }, + "returns": "An instance for the opened or created request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 125 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An instance for the opened or created request queue." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open an existing request queue or create a new one.\n\nThis method first tries to find an existing queue by ID or name.\nIf found, it returns a client for that queue. If not found, it creates\na new queue with the specified parameters.\n" + } + ] + }, + "flags": {}, + "id": 3561, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The ID of the request queue to open. Takes precedence over name." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3562, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the request queue for named (global scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3563, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The alias of the request queue for unnamed (run scope) storages." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3564, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The SQL storage client used to access the database.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3565, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ], + "type": { + "name": "Self", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3566, + "module": "storage_clients._base._request_queue_client", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the metadata of the request queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 166 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the metadata of the request queue." + } + ] + }, + "flags": {}, + "id": 2761, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "RequestQueueMetadata", + "type": "reference", + "target": "3644" + }, + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_metadata", + "target": 2760, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this request queue and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3568, + "module": "storage_clients._sql._request_queue_client", + "name": "drop", + "parsedDocstring": { + "text": "Delete this request queue and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete this request queue and all its records from the database.\n\nThis operation is irreversible. Uses CASCADE deletion to remove all related records." + } + ] + }, + "flags": {}, + "id": 3569, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.drop", + "target": 2762, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets pending_request_count and handled_request_count to 0 and deletes all records from request_queue_records\ntable." + } + ] + }, + "decorations": [ + { + "name": "override" + } + ], + "flags": {}, + "groups": [], + "id": 3570, + "module": "storage_clients._sql._request_queue_client", + "name": "purge", + "parsedDocstring": { + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets pending_request_count and handled_request_count to 0 and deletes all records from request_queue_records\ntable." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 183 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove all items from this dataset while keeping the dataset structure.\n\nResets pending_request_count and handled_request_count to 0 and deletes all records from request_queue_records\ntable." + } + ] + }, + "flags": {}, + "id": 3571, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.purge", + "target": 2764, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3572, + "module": "storage_clients._base._request_queue_client", + "name": "add_batch_of_requests", + "parsedDocstring": { + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n", + "args": { + "requests": "The collection of requests to add to the queue.", + "forefront": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests.", + "batch_size": "The maximum number of requests to add in a single batch.", + "wait_time_between_batches": "The time to wait between adding batches of requests.", + "wait_for_all_requests_to_be_added": "If True, the method will wait until all requests are added\nto the queue before returning.", + "wait_for_all_requests_to_be_added_timeout": "The maximum time to wait for all requests to be added.\n" + }, + "returns": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 204 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A response object containing information about which requests were successfully\nprocessed and which failed (if any)." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add batch of requests to the queue.\n\nThis method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n(determined by `unique_key`). Duplicates will be identified but not re-added to the queue.\n" + } + ] + }, + "flags": {}, + "id": 2767, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_batch_of_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The collection of requests to add to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2768, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to put the added requests at the beginning (True) or the end (False) of the queue.\nWhen True, the requests will be processed sooner than previously added requests." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2769, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AddRequestsResponse", + "type": "reference", + "target": "3676" + }, + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.add_batch_of_requests", + "target": 2766, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3576, + "module": "storage_clients._base._request_queue_client", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a request from the queue.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The retrieved request, or None, if it did not exist." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 395 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The retrieved request, or None, if it did not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a request from the queue.\n" + } + ] + }, + "flags": {}, + "id": 2771, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2772, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.get_request", + "target": 2770, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3579, + "module": "storage_clients._base._request_queue_client", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The request or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 414 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request or `None` if there are no more pending requests." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 2774, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.fetch_next_request", + "target": 2773, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3581, + "module": "storage_clients._base._request_queue_client", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the 
`RequestQueue.fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 496 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nHandled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 2776, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2777, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.mark_request_as_handled", + "target": 2775, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + 
"summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3584, + "module": "storage_clients._base._request_queue_client", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "Whether to add the request to the head or the end of the queue.\n" + }, + "returns": "Information about the queue operation. `None` if the given request was not in progress." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 527 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation. `None` if the given request was not in progress." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue.\n\nThe request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 2779, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2780, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Whether to add the request to the head or the end of the queue.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2781, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.reclaim_request", + "target": 2778, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3588, + "module": "storage_clients._base._request_queue_client", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the request queue is empty.\n", + "returns": "True if the request queue is empty, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 584 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the request queue is empty, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n" + } + ] + }, + "flags": {}, + "id": 2783, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestQueueClient.is_empty", + "target": 2782, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." + } + ] + }, + "decorations": [ + { + "name": "asynccontextmanager" + } + ], + "flags": {}, + "groups": [], + "id": 3976, + "module": "storage_clients._sql._client_mixin", + "name": "get_session", + "parsedDocstring": { + "text": "Create a new SQLAlchemy session for this storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 197 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new SQLAlchemy session for this storage." 
+ } + ] + }, + "flags": {}, + "id": 3543, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_session", + "parameters": [ + { + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3544, + "kind": 32768, + "kindString": "Parameter", + "name": "with_simple_commit", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator[AsyncSession]", + "type": "reference" + }, + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "SqlClientMixin.get_session", + "target": 3542, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "SQL implementation of the request queue client.\n\nThis client persists requests to a SQL database with transaction handling and\nconcurrent access safety. Requests are stored with sequence-based ordering and\nefficient querying capabilities.\n\nThe implementation uses negative sequence numbers for forefront (high-priority) requests\nand positive sequence numbers for regular requests, allowing for efficient single-query\nordering. A cache mechanism reduces database queries.\n\nThe request queue data is stored in SQL database tables following the pattern:\n- `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag)\n- `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication,\nsequence numbers for ordering, and processing status flags\n- `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests.\n- `request_queue_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nRequests are serialized to JSON for storage and maintain proper ordering through sequence\nnumbers. 
The implementation provides concurrent access safety through transaction\nhandling, locking mechanisms, and optimized database indexes for efficient querying." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3556, + 3572, + 3568, + 3579, + 3566, + 3576, + 3976, + 3588, + 3581, + 3560, + 3570, + 3584 + ], + "title": "Methods" + } + ], + "id": 3555, + "module": "storage_clients._sql._request_queue_client", + "name": "SqlRequestQueueClient", + "parsedDocstring": { + "text": "SQL implementation of the request queue client.\n\nThis client persists requests to a SQL database with transaction handling and\nconcurrent access safety. Requests are stored with sequence-based ordering and\nefficient querying capabilities.\n\nThe implementation uses negative sequence numbers for forefront (high-priority) requests\nand positive sequence numbers for regular requests, allowing for efficient single-query\nordering. A cache mechanism reduces database queries.\n\nThe request queue data is stored in SQL database tables following the pattern:\n- `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag)\n- `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication,\nsequence numbers for ordering, and processing status flags\n- `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests.\n- `request_queue_metadata_buffer` table: Buffers metadata updates for performance optimization\n\nRequests are serialized to JSON for storage and maintain proper ordering through sequence\nnumbers. The implementation provides concurrent access safety through transaction\nhandling, locking mechanisms, and optimized database indexes for efficient querying." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "SqlClientMixin", + "target": "3537", + "type": "reference" + }, + { + "name": "RequestQueueClient", + "target": "2759", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3590, + "module": "storage_clients._sql._storage_client", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the SQL storage client.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3592, + "module": "storage_clients._sql._storage_client", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize the SQL storage client.\n", + "args": { + "connection_string": "Database connection string (e.g., \"sqlite+aiosqlite:///crawlee.db\").\nIf not provided, defaults to SQLite database in the storage directory.", + "engine": "Pre-configured AsyncEngine instance. If provided, connection_string is ignored." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 54 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the SQL storage client.\n" + } + ] + }, + "flags": {}, + "id": 3593, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Database connection string (e.g., \"sqlite+aiosqlite:///crawlee.db\").\nIf not provided, defaults to SQLite database in the storage directory." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3594, + "kind": 32768, + "kindString": "Parameter", + "name": "connection_string", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Pre-configured AsyncEngine instance. If provided, connection_string is ignored." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3595, + "kind": 32768, + "kindString": "Parameter", + "name": "engine", + "type": { + "name": "AsyncEngine | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "AsyncEngine" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Async context manager entry." 
+ } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3596, + "module": "storage_clients._sql._storage_client", + "name": "__aenter__", + "parsedDocstring": { + "text": "Async context manager entry." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 86 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Async context manager entry." + } + ] + }, + "flags": {}, + "id": 3597, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aenter__", + "parameters": [], + "type": { + "name": "SqlStorageClient", + "type": "reference", + "target": "3591" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Async context manager exit." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3598, + "module": "storage_clients._sql._storage_client", + "name": "__aexit__", + "parsedDocstring": { + "text": "Async context manager exit." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 90 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Async context manager exit." 
+ } + ] + }, + "flags": {}, + "id": 3599, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "__aexit__", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3600, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_type", + "type": { + "name": "type[BaseException] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "BaseException" + } + ], + "target": "981" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3601, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_value", + "type": { + "name": "BaseException | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "BaseException" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3602, + "kind": 32768, + "kindString": "Parameter", + "name": "exc_traceback", + "type": { + "name": "TracebackType | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "TracebackType" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the SQLAlchemy AsyncEngine instance." + } + ] + }, + "decorations": [ + { + "name": "property" + } + ], + "flags": {}, + "groups": [], + "id": 3603, + "module": "storage_clients._sql._storage_client", + "name": "engine", + "parsedDocstring": { + "text": "Get the SQLAlchemy AsyncEngine instance." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "AsyncEngine", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the database dialect name." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3604, + "module": "storage_clients._sql._storage_client", + "name": "get_dialect_name", + "parsedDocstring": { + "text": "Get the database dialect name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 106 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the database dialect name." + } + ] + }, + "flags": {}, + "id": 3605, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_dialect_name", + "parameters": [], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the database schema.\n\nThis method creates all necessary tables if they don't exist.\nShould be called before using the storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3606, + "module": "storage_clients._sql._storage_client", + "name": "initialize", + "parsedDocstring": { + "text": "Initialize the database schema.\n\nThis method creates all necessary tables if they don't exist.\nShould be called before using the storage client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize the database schema.\n\nThis method creates all necessary tables if they don't exist.\nShould be called before using the storage client." + } + ] + }, + "flags": {}, + "id": 3607, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "initialize", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3608, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the database connection pool." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3609, + "module": "storage_clients._sql._storage_client", + "name": "close", + "parsedDocstring": { + "text": "Close the database connection pool." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Close the database connection pool." 
+ } + ] + }, + "flags": {}, + "id": 3610, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "close", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a new database session.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3611, + "module": "storage_clients._sql._storage_client", + "name": "create_session", + "parsedDocstring": { + "text": "Create a new database session.\n", + "returns": "A new AsyncSession instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 167 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A new AsyncSession instance." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Create a new database session.\n" + } + ] + }, + "flags": {}, + "id": 3612, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "create_session", + "parameters": [], + "type": { + "name": "AsyncSession", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3613, + "module": "storage_clients._base._storage_client", + "name": "create_dataset_client", + "parsedDocstring": { + "text": "Create a dataset client." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 178 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a dataset client." + } + ] + }, + "flags": {}, + "id": 2789, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_dataset_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2790, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2791, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2792, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2793, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": 
"DatasetClient", + "type": "reference", + "target": "2808" + }, + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_dataset_client", + "target": 2788, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3619, + "module": "storage_clients._base._storage_client", + "name": "create_kvs_client", + "parsedDocstring": { + "text": "Create a key-value store client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 200 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a key-value store client." 
+ } + ] + }, + "flags": {}, + "id": 2795, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_kvs_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2796, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2797, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2798, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2799, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + }, + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_kvs_client", + "target": 2794, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + 
"comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3625, + "module": "storage_clients._base._storage_client", + "name": "create_rq_client", + "parsedDocstring": { + "text": "Create a request queue client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 222 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Create a request queue client." + } + ] + }, + "flags": {}, + "id": 2801, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "create_rq_client", + "parameters": [ + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2802, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2803, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2804, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "defaultValue": "None", + "flags": { + "isOptional": 
true, + "keyword-only": true + }, + "id": 2805, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + }, + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + } + ], + "overwrites": { + "name": "StorageClient.create_rq_client", + "target": 2800, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3992, + "module": "storage_clients._base._storage_client", + "name": "get_storage_client_cache_key", + "parsedDocstring": { + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return a cache key that can differentiate between different storages of this and other clients.\n\nCan be based on configuration or on the client itself. By default, returns a module and name of the client\nclass." 
+ } + ] + }, + "flags": {}, + "id": 2786, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_storage_client_cache_key", + "parameters": [ + { + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2787, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration", + "type": "reference", + "target": "235" + } + } + ], + "type": { + "name": "Hashable", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_storage_client_cache_key", + "target": 2785, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3993, + "module": "storage_clients._base._storage_client", + "name": "get_rate_limit_errors", + "parsedDocstring": { + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_base/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return statistics about rate limit errors encountered by the HTTP client in storage client." 
+ } + ] + }, + "flags": {}, + "id": 2807, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "get_rate_limit_errors", + "parameters": [], + "type": { + "name": "dict[int, int]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "StorageClient.get_rate_limit_errors", + "target": 2806, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "SQL implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for\nrecords.\n\nThe client accepts either a database connection string or a pre-configured AsyncEngine. If neither is\nprovided, it creates a default SQLite database 'crawlee.db' in the storage directory.\n\nDatabase schema is automatically created during initialization. SQLite databases receive performance\noptimizations including WAL mode and increased cache size.\n\n\n:::warning Warning\nThis is an experimental feature. The behavior and interface may change in future versions.\n:::" + } + ] + }, + "decorations": [ + { + "args": "('Storage clients')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3596, + 3598, + 3592, + 3609, + 3613, + 3619, + 3625, + 3611, + 3604, + 3993, + 3992, + 3606 + ], + "title": "Methods" + }, + { + "children": [ + 3603 + ], + "title": "Properties" + } + ], + "id": 3591, + "module": "storage_clients._sql._storage_client", + "name": "SqlStorageClient", + "parsedDocstring": { + "text": "SQL implementation of the storage client.\n\nThis storage client provides access to datasets, key-value stores, and request queues that persist data\nto a SQL database using SQLAlchemy 2+. 
Each storage type uses two tables: one for metadata and one for\nrecords.\n\nThe client accepts either a database connection string or a pre-configured AsyncEngine. If neither is\nprovided, it creates a default SQLite database 'crawlee.db' in the storage directory.\n\nDatabase schema is automatically created during initialization. SQLite databases receive performance\noptimizations including WAL mode and increased cache size.\n\n\n:::warning Warning\nThis is an experimental feature. The behavior and interface may change in future versions.\n:::" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/_sql/_storage_client.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageClient", + "target": "2784", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3631, + "module": "storage_clients.models", + "name": "KvsValueType", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 13 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3633, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 23 + } + ], + "type": { + 
"name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3634, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "The unique identifier of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3635, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "The name of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last accessed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3636, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last accessed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was created." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3637, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "The timestamp when the storage was created." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last modified." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3638, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last modified." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "datetime", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents the base model for storage metadata.\n\nIt contains common fields shared across all specific storage types." 
+ } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3636, + 3637, + 3634, + 3633, + 3638, + 3635 + ], + "title": "Properties" + } + ], + "id": 3632, + "module": "storage_clients.models", + "name": "StorageMetadata", + "parsedDocstring": { + "text": "Represents the base model for storage metadata.\n\nIt contains common fields shared across all specific storage types." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 17 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "DatasetMetadata", + "target": "3639", + "type": "reference" + }, + { + "name": "KeyValueStoreMetadata", + "target": "3642", + "type": "reference" + }, + { + "name": "RequestQueueMetadata", + "target": "3644", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3640, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 45 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 3633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of items in the dataset." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3641, + "module": "storage_clients.models", + "name": "item_count", + "parsedDocstring": { + "text": "The number of items in the dataset." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 47 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3932, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "The unique identifier of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Annotated[str, Field(alias='id')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 3634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3933, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "The name of the storage." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default=None)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 3635, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last accessed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3934, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last accessed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='accessedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 3636, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was created." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3935, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "The timestamp when the storage was created." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='createdAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 3637, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last modified." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3936, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last modified." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='modifiedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.modified_at", + "target": 3638, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a dataset metadata." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3934, + 3935, + 3932, + 3641, + 3640, + 3936, + 3933 + ], + "title": "Properties" + } + ], + "id": 3639, + "module": "storage_clients.models", + "name": "DatasetMetadata", + "parsedDocstring": { + "text": "Model for a dataset metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 42 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "3632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3643, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 55 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 3633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3937, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "The unique identifier of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Annotated[str, Field(alias='id')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 3634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3938, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "The name of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default=None)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 3635, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last accessed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3939, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last accessed." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='accessedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 3636, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was created." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3940, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "The timestamp when the storage was created." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='createdAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 3637, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last modified." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3941, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last modified." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='modifiedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.modified_at", + "target": 3638, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store metadata." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3939, + 3940, + 3937, + 3643, + 3941, + 3938 + ], + "title": "Properties" + } + ], + "id": 3642, + "module": "storage_clients.models", + "name": "KeyValueStoreMetadata", + "parsedDocstring": { + "text": "Model for a key-value store metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 52 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "3632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3645, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 62 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "StorageMetadata.model_config", + "target": 3633, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates whether the queue has been accessed by multiple clients (consumers)." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3646, + "module": "storage_clients.models", + "name": "had_multiple_clients", + "parsedDocstring": { + "text": "Indicates whether the queue has been accessed by multiple clients (consumers)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 64 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests that have been handled from the queue." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3647, + "module": "storage_clients.models", + "name": "handled_request_count", + "parsedDocstring": { + "text": "The number of requests that have been handled from the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 67 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests that are still pending in the queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3648, + "module": "storage_clients.models", + "name": "pending_request_count", + "parsedDocstring": { + "text": "The number of requests that are still pending in the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 70 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of requests that have been added to the queue." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3649, + "module": "storage_clients.models", + "name": "total_request_count", + "parsedDocstring": { + "text": "The total number of requests that have been added to the queue." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 73 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3942, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "The unique identifier of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "Annotated[str, Field(alias='id')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.id", + "target": 3634, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3943, + "module": "storage_clients.models", + "name": "name", + "parsedDocstring": { + "text": "The name of the storage." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 28 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='name', default=None)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.name", + "target": 3635, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last accessed." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3944, + "module": "storage_clients.models", + "name": "accessed_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last accessed." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 31 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='accessedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.accessed_at", + "target": 3636, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was created." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3945, + "module": "storage_clients.models", + "name": "created_at", + "parsedDocstring": { + "text": "The timestamp when the storage was created." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='createdAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.created_at", + "target": 3637, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The timestamp when the storage was last modified." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3946, + "module": "storage_clients.models", + "name": "modified_at", + "parsedDocstring": { + "text": "The timestamp when the storage was last modified." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 37 + } + ], + "type": { + "name": "Annotated[datetime, Field(alias='modifiedAt')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "StorageMetadata.modified_at", + "target": 3638, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a request queue metadata." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3944, + 3945, + 3646, + 3647, + 3942, + 3645, + 3946, + 3943, + 3648, + 3649 + ], + "title": "Properties" + } + ], + "id": 3644, + "module": "storage_clients.models", + "name": "RequestQueueMetadata", + "parsedDocstring": { + "text": "Model for a request queue metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 59 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "StorageMetadata", + "target": "3632", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3651, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 81 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The 
key of the record.\n\nA unique identifier for the record in the key-value store." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3652, + "module": "storage_clients.models", + "name": "key", + "parsedDocstring": { + "text": "The key of the record.\n\nA unique identifier for the record in the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The MIME type of the record.\n\nDescribe the format and type of data stored in the record, following the MIME specification." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3653, + "module": "storage_clients.models", + "name": "content_type", + "parsedDocstring": { + "text": "The MIME type of the record.\n\nDescribe the format and type of data stored in the record, following the MIME specification." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The size of the record in bytes." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3654, + "module": "storage_clients.models", + "name": "size", + "parsedDocstring": { + "text": "The size of the record in bytes." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='size', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store record metadata." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3653, + 3652, + 3651, + 3654 + ], + "title": "Properties" + } + ], + "id": 3650, + "module": "storage_clients.models", + "name": "KeyValueStoreRecordMetadata", + "parsedDocstring": { + "text": "Model for a key-value store record metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "KeyValueStoreRecord", + "target": "3655", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3656, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 103 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "overwrites": { + "name": "KeyValueStoreRecordMetadata.model_config", + "target": 3651, + "type": "reference" + } + }, + { + "kind": 1024, + 
"kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The value of the record." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3657, + "module": "storage_clients.models", + "name": "value", + "parsedDocstring": { + "text": "The value of the record." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 105 + } + ], + "type": { + "name": "KvsValueType", + "type": "reference", + "target": "3631" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key of the record.\n\nA unique identifier for the record in the key-value store." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3929, + "module": "storage_clients.models", + "name": "key", + "parsedDocstring": { + "text": "The key of the record.\n\nA unique identifier for the record in the key-value store." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Annotated[str, Field(alias='key')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "KeyValueStoreRecordMetadata.key", + "target": 3652, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The MIME type of the record.\n\nDescribe the format and type of data stored in the record, following the MIME specification." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3930, + "module": "storage_clients.models", + "name": "content_type", + "parsedDocstring": { + "text": "The MIME type of the record.\n\nDescribe the format and type of data stored in the record, following the MIME specification." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 89 + } + ], + "type": { + "name": "Annotated[str, Field(alias='contentType')]", + "type": "reference" + }, + "inheritedFrom": { + "name": "KeyValueStoreRecordMetadata.content_type", + "target": 3653, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The size of the record in bytes." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3931, + "module": "storage_clients.models", + "name": "size", + "parsedDocstring": { + "text": "The size of the record in bytes." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "Annotated[int | None, Field(alias='size', default=None)]", + "type": "reference" + }, + "inheritedFrom": { + "name": "KeyValueStoreRecordMetadata.size", + "target": 3654, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a key-value store record." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3930, + 3929, + 3656, + 3931, + 3657 + ], + "title": "Properties" + } + ], + "id": 3655, + "module": "storage_clients.models", + "name": "KeyValueStoreRecord", + "parsedDocstring": { + "text": "Model for a key-value store record." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "KeyValueStoreRecordMetadata", + "target": "3650", + "type": "reference" + } + ] + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3659, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of objects returned on this page." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3660, + "module": "storage_clients.models", + "name": "count", + "parsedDocstring": { + "text": "The number of objects returned on this page." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 115 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The starting position of the first object returned, as specified in the API call." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3661, + "module": "storage_clients.models", + "name": "offset", + "parsedDocstring": { + "text": "The starting position of the first object returned, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 118 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of objects to return, as specified in the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3662, + "module": "storage_clients.models", + "name": "limit", + "parsedDocstring": { + "text": "The maximum number of objects to return, as specified in the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 121 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The total number of objects that match the criteria of the API call." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3663, + "module": "storage_clients.models", + "name": "total", + "parsedDocstring": { + "text": "The total number of objects that match the criteria of the API call." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 124 + } + ], + "type": { + "name": "int", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Indicates if the returned list is in descending order." 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3664, + "module": "storage_clients.models", + "name": "desc", + "parsedDocstring": { + "text": "Indicates if the returned list is in descending order." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 127 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a single page of dataset items returned from a collection list method." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3660, + 3664, + 3662, + 3659, + 3661, + 3663 + ], + "title": "Properties" + } + ], + "id": 3658, + "module": "storage_clients.models", + "name": "DatasetItemsListPage", + "parsedDocstring": { + "text": "Model for a single page of dataset items returned from a collection list method." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 110 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3666, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 143 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Internal representation of the request by the storage client. Only some clients use id." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3667, + "module": "storage_clients.models", + "name": "id", + "parsedDocstring": { + "text": "Internal representation of the request by the storage client. Only some clients use id." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 145 + } + ], + "type": { + "name": "Annotated[str | None, Field(alias='requestId', default=None)]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3668, + "module": "storage_clients.models", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3669, + "module": "storage_clients.models", + "name": "was_already_present", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 149 + } + ], + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3670, + "module": "storage_clients.models", + "name": "was_already_handled", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "bool", + "type": 
"reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents a processed request." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3667, + 3666, + 3668, + 3670, + 3669 + ], + "title": "Properties" + } + ], + "id": 3665, + "module": "storage_clients.models", + "name": "ProcessedRequest", + "parsedDocstring": { + "text": "Represents a processed request." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3672, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 157 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3673, + "module": "storage_clients.models", + "name": "unique_key", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 159 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" 
+ } + ] + }, + "flags": {}, + "groups": [], + "id": 3674, + "module": "storage_clients.models", + "name": "url", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3675, + "module": "storage_clients.models", + "name": "method", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 161 + } + ], + "type": { + "name": "Annotated[HttpMethod | None, Field()]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "HttpMethod", + "target": "300" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Represents an unprocessed request." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3675, + 3672, + 3673, + 3674 + ], + "title": "Properties" + } + ], + "id": 3671, + "module": "storage_clients.models", + "name": "UnprocessedRequest", + "parsedDocstring": { + "text": "Represents an unprocessed request." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 154 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3677, + "module": "storage_clients.models", + "name": "model_config", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Successfully processed requests, including information about whether they were\nalready present in the queue and whether they had been handled previously." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3678, + "module": "storage_clients.models", + "name": "processed_requests", + "parsedDocstring": { + "text": "Successfully processed requests, including information about whether they were\nalready present in the queue and whether they had been handled previously." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 175 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests that could not be processed, typically due to validation errors or other issues." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3679, + "module": "storage_clients.models", + "name": "unprocessed_requests", + "parsedDocstring": { + "text": "Requests that could not be processed, typically due to validation errors or other issues." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 179 + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "UnprocessedRequest", + "target": "3671" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Model for a response to add requests to a queue.\n\nContains detailed information about the processing results when adding multiple requests\nto a queue. This includes which requests were successfully processed and which ones\nencountered issues during processing." + } + ] + }, + "decorations": [ + { + "args": "('Storage data')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3677, + 3678, + 3679 + ], + "title": "Properties" + } + ], + "id": 3676, + "module": "storage_clients.models", + "name": "AddRequestsResponse", + "parsedDocstring": { + "text": "Model for a response to add requests to a queue.\n\nContains detailed information about the processing results when adding multiple requests\nto a queue. 
This includes which requests were successfully processed and which ones\nencountered issues during processing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storage_clients/models.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 165 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3681, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 20 + } + ], + "type": { + "name": "str", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3682, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3683, + "module": "storages._base", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the storage metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "flags": {}, + "id": 3684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "3639" + }, + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "3642" + } + ] + }, + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "3644" + } + ] + }, + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3685, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name (global scope, persists across runs). 
Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\").", + "alias": "The storage alias (run scope, creates unnamed storage).", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 34 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 3686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3687, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\")." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3688, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage alias (run scope, creates unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3689, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3690, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3691, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + }, + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3692, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 57 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ } + ] + }, + "flags": {}, + "id": 3693, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3694, + "module": "storages._base", + "name": "purge", + "parsedDocstring": { + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 61 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "flags": {}, + "id": 3695, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Base class for storages." 
+ } + ] + }, + "decorations": [ + { + "args": "('Storages')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3692, + 3683, + 3685, + 3694 + ], + "title": "Methods" + }, + { + "children": [ + 3681, + 3682 + ], + "title": "Properties" + } + ], + "id": 3680, + "module": "storages._base", + "name": "Storage", + "parsedDocstring": { + "text": "Base class for storages." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_base.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 15 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedBy": [ + { + "name": "KeyValueStore", + "target": "3700", + "type": "reference" + }, + { + "name": "Dataset", + "target": "3766", + "type": "reference" + }, + { + "name": "RequestQueue", + "target": "3852", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3696, + "module": "storages._key_value_store", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3697, + "module": "storages._key_value_store", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 32 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": 
"Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3699, + "module": "storages._key_value_store", + "name": "root", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 36 + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3699 + ], + "title": "Properties" + } + ], + "id": 3698, + "module": "storages._key_value_store", + "name": "AutosavedValue", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `KeyValueStore.open` constructor to create a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3701, + "module": "storages._key_value_store", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `KeyValueStore.open` constructor to create a new instance.\n", + "args": { + "client": "An instance of a storage client.", + "id": "The unique identifier of the storage.", + "name": "The name of the storage, if available." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 78 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `KeyValueStore.open` constructor to create a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3702, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An instance of a storage client." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3703, + "kind": 32768, + "kindString": "Parameter", + "name": "client", + "type": { + "name": "KeyValueStoreClient", + "type": "reference", + "target": "2842" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3704, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage, if available." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3705, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3706, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 99 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 3681, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3707, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 3682, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3708, + "module": "storages._base", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the storage metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "flags": {}, + "id": 3684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "3639" + }, + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "3642" + } + ] + }, + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "3644" + } + ] + }, + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3710, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. 
\"my-value-1\").", + "alias": "The storage alias (run scope, creates unnamed storage).", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 3686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3687, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\")." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3688, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage alias (run scope, creates unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3689, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3690, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3691, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + }, + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3717, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 140 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ } + ] + }, + "flags": {}, + "id": 3693, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.drop", + "target": 3692, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3719, + "module": "storages._base", + "name": "purge", + "parsedDocstring": { + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 148 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." 
+ } + ] + }, + "flags": {}, + "id": 3695, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3721, + "module": "storages._key_value_store", + "name": "get_value", + "parsedDocstring": { + "text": "Get a value from the KVS.\n", + "args": { + "key": "Key of the record to retrieve.", + "default_value": "Default value returned in case the record does not exist.\n" + }, + "returns": "The value associated with the given key. `default_value` is used in case the record does not exist." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 160 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3722, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3723, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3724, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3753, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3754, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3755, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3756, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3757, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + }, + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The value associated with the given key. `default_value` is used in case the record does not exist." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3758, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to retrieve." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3759, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Default value returned in case the record does not exist.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3760, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "T | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "T", + "target": "299" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3725, + "module": "storages._key_value_store", + "name": "set_value", + "parsedDocstring": { + "text": "Set a value in the KVS.\n", + "args": { + "key": "Key of the record to set.", + "value": "Value to set.", + "content_type": "The MIME content type string." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 173 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set a value in the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3726, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "set_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to set." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3727, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value to set." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3728, + "kind": 32768, + "kindString": "Parameter", + "name": "value", + "type": { + "name": "Any", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The MIME content type string." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3729, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3730, + "module": "storages._key_value_store", + "name": "delete_value", + "parsedDocstring": { + "text": "Delete a value from the KVS.\n", + "args": { + "key": "Key of the record to delete." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 188 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Delete a value from the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3731, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "delete_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to delete." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3732, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the existing keys in the KVS.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3733, + "module": "storages._key_value_store", + "name": "iterate_keys", + "parsedDocstring": { + "text": "Iterate over the existing keys in the KVS.\n", + "args": { + "exclusive_start_key": "Key to start the iteration from.", + "limit": "Maximum number of keys to return. None means no limit.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 196 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over the existing keys in the KVS.\n" + } + ] + }, + "flags": {}, + "id": 3734, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_keys", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key to start the iteration from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3735, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of keys to return. 
None means no limit.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3736, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "List all the existing keys in the KVS.\n\nIt uses client's `iterate_keys` method to get the keys.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3737, + "module": "storages._key_value_store", + "name": "list_keys", + "parsedDocstring": { + "text": "List all the existing keys in the KVS.\n\nIt uses client's `iterate_keys` method to get the keys.\n", + "args": { + "exclusive_start_key": "Key to start the iteration from.", + "limit": "Maximum number of keys to return.\n" + }, + "returns": "A list of keys in the KVS." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 216 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A list of keys in the KVS." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "List all the existing keys in the KVS.\n\nIt uses client's `iterate_keys` method to get the keys.\n" + } + ] + }, + "flags": {}, + "id": 3738, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_keys", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key to start the iteration from." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3739, + "kind": 32768, + "kindString": "Parameter", + "name": "exclusive_start_key", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Maximum number of keys to return.\n" + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3740, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "KeyValueStoreRecordMetadata", + "target": "3650" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3741, + "module": "storages._key_value_store", + "name": "record_exists", + "parsedDocstring": { + "text": "Check if a record with the given key exists in the key-value store.\n", + "args": { + "key": "Key of the record to check for existence.\n" + }, + "returns": "True if a record with the given key exists, False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 240 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if a record with the given key exists, False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if a record with the given key exists in the key-value store.\n" + } + ] + }, + "flags": {}, + "id": 3742, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "record_exists", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record to check for existence.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3743, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "bool", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3744, + "module": "storages._key_value_store", + "name": "get_public_url", + "parsedDocstring": { + "text": "Get the public URL for the given key.\n", + "args": { + "key": "Key of the record for which URL is required.\n" + }, + "returns": "The public URL for the given key." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 251 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The public URL for the given key." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get the public URL for the given key.\n" + } + ] + }, + "flags": {}, + "id": 3745, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_public_url", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record for which URL is required.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3746, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "str", + "type": "reference" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get a value from KVS that will be automatically saved on changes.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3747, + "module": "storages._key_value_store", + "name": "get_auto_saved_value", + "parsedDocstring": { + "text": "Get a value from KVS that will be automatically saved on changes.\n", + "args": { + "key": "Key of the record, to store the value.", + "default_value": "Value to be used if the record does not exist yet. Should be a dictionary.\n" + }, + "returns": "Return the value of the key." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 262 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Return the value of the key." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Get a value from KVS that will be automatically saved on changes.\n" + } + ] + }, + "flags": {}, + "id": 3748, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_auto_saved_value", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key of the record, to store the value." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3749, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Value to be used if the record does not exist yet. 
Should be a dictionary.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3750, + "kind": 32768, + "kindString": "Parameter", + "name": "default_value", + "type": { + "name": "dict[str, JsonSerializable] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "dict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "JsonSerializable" + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3751, + "module": "storages._key_value_store", + "name": "persist_autosaved_values", + "parsedDocstring": { + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 299 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Force autosaved values to be saved without waiting for an event in Event Manager." 
+ } + ] + }, + "flags": {}, + "id": 3752, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "persist_autosaved_values", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Key-value store is a storage for reading and writing data records with unique key identifiers.\n\nThe key-value store class acts as a high-level interface for storing, retrieving, and managing data records\nidentified by unique string keys. It abstracts away the underlying storage implementation details,\nallowing you to work with the same API regardless of whether data is stored in memory, on disk,\nor in the cloud.\n\nEach data record is associated with a specific MIME content type, allowing storage of various\ndata formats such as JSON, text, images, HTML snapshots or any binary data. This class is\ncommonly used to store inputs, outputs, and other artifacts of crawler operations.\n\nYou can instantiate a key-value store using the `open` class method, which will create a store\nwith the specified name or id. 
The underlying storage implementation is determined by the configured\nstorage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import KeyValueStore\n\n# Open a named key-value store\nkvs = await KeyValueStore.open(name='my-store')\n\n# Store and retrieve data\nawait kvs.set_value('product-1234.json', [{'name': 'Smartphone', 'price': 799.99}])\nproduct = await kvs.get_value('product-1234')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Storages')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3701, + 3730, + 3717, + 3747, + 3708, + 3744, + 3721, + 3733, + 3737, + 3710, + 3751, + 3719, + 3741, + 3725 + ], + "title": "Methods" + }, + { + "children": [ + 3706, + 3707 + ], + "title": "Properties" + } + ], + "id": 3700, + "module": "storages._key_value_store", + "name": "KeyValueStore", + "parsedDocstring": { + "text": "Key-value store is a storage for reading and writing data records with unique key identifiers.\n\nThe key-value store class acts as a high-level interface for storing, retrieving, and managing data records\nidentified by unique string keys. It abstracts away the underlying storage implementation details,\nallowing you to work with the same API regardless of whether data is stored in memory, on disk,\nor in the cloud.\n\nEach data record is associated with a specific MIME content type, allowing storage of various\ndata formats such as JSON, text, images, HTML snapshots or any binary data. This class is\ncommonly used to store inputs, outputs, and other artifacts of crawler operations.\n\nYou can instantiate a key-value store using the `open` class method, which will create a store\nwith the specified name or id. 
The underlying storage implementation is determined by the configured\nstorage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import KeyValueStore\n\n# Open a named key-value store\nkvs = await KeyValueStore.open(name='my-store')\n\n# Store and retrieve data\nawait kvs.set_value('product-1234.json', [{'name': 'Smartphone', 'price': 799.99}])\nproduct = await kvs.get_value('product-1234')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_key_value_store.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Storage", + "target": "3680", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3761, + "module": "storages._utils", + "name": "NAME_REGEX", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 3 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3762, + "module": "storages._utils", + "name": "validate_storage_name", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_utils.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 6 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3763, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "validate_storage_name", + "parameters": [ + { + 
"flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3764, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3765, + "module": "storages._dataset", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `Dataset.open` constructor to create a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3767, + "module": "storages._dataset", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `Dataset.open` constructor to create a new instance.\n", + "args": { + "client": "An instance of a storage client.", + "id": "The unique identifier of the storage.", + "name": "The name of the storage, if available." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `Dataset.open` constructor to create a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3768, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An instance of a storage client." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3769, + "kind": 32768, + "kindString": "Parameter", + "name": "client", + "type": { + "name": "DatasetClient", + "type": "reference", + "target": "2808" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3770, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage, if available." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3771, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." 
+ } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3772, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 87 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 3681, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3773, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 92 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 3682, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3774, + "module": "storages._base", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the storage metadata." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 96 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "flags": {}, + "id": 3684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "3639" + }, + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "3642" + } + ] + }, + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "3644" + } + ] + }, + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3776, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. 
\"my-value-1\").", + "alias": "The storage alias (run scope, creates unnamed storage).", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 101 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 3686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3687, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\")." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3688, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage alias (run scope, creates unnamed storage)." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3689, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3690, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3691, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + }, + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3783, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 128 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ } + ] + }, + "flags": {}, + "id": 3693, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.drop", + "target": 3692, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3785, + "module": "storages._base", + "name": "purge", + "parsedDocstring": { + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 134 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." 
+ } + ] + }, + "flags": {}, + "id": 3695, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3787, + "module": "storages._dataset", + "name": "push_data", + "parsedDocstring": { + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n", + "args": { + "data": "A JSON serializable data structure to be stored in the dataset. The JSON representation\nof each item must be smaller than 9MB." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 137 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Store an object or an array of objects to the dataset.\n\nThe size of the data is limited by the receiving API and therefore `push_data()` will only\nallow objects whose JSON representation is smaller than 9MB. When an array is passed,\nnone of the included objects may be larger than 9MB, but the array itself may be of any size.\n" + } + ] + }, + "flags": {}, + "id": 3788, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "push_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "A JSON serializable data structure to be stored in the dataset. The JSON representation\nof each item must be smaller than 9MB." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3789, + "kind": 32768, + "kindString": "Parameter", + "name": "data", + "type": { + "name": "list[dict[str, Any]] | dict[str, Any]", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + }, + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3790, + "module": "storages._dataset", + "name": "get_data", + "parsedDocstring": { + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n", + "args": { + "offset": "Skips the specified number of items at the start.", + "limit": "The maximum number of items to retrieve. Unlimited if None.", + "clean": "Return only non-empty items and excludes hidden fields. 
Shortcut for skip_hidden and skip_empty.", + "desc": "Set to True to sort results in descending order.", + "fields": "Fields to include in each item. Sorts fields as specified if provided.", + "omit": "Fields to exclude from each item.", + "unwind": "Unwinds items by a specified array field, turning each element into a separate item.", + "skip_empty": "Excludes empty items from the results if True.", + "skip_hidden": "Excludes fields starting with '#' if True.", + "flatten": "Fields to be flattened in returned items.", + "view": "Specifies the dataset view to be used.\n" + }, + "returns": "An object with filtered, sorted, and paginated dataset items plus pagination details." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 150 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "An object with filtered, sorted, and paginated dataset items plus pagination details." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\nThis method provides the flexibility to filter, sort, and modify the appearance of dataset items\nwhen listed. Each parameter modifies the result set according to its purpose. The method also\nsupports pagination through 'offset' and 'limit' parameters.\n" + } + ] + }, + "flags": {}, + "id": 3791, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_data", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skips the specified number of items at the start." 
+ } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3792, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3793, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3794, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3795, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3796, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3797, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds items by a specified array field, turning each element into a separate item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3798, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes empty items from the results if True." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3799, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3800, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to be flattened in returned items." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3801, + "kind": 32768, + "kindString": "Parameter", + "name": "flatten", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the dataset view to be used.\n" + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3802, + "kind": 32768, + "kindString": "Parameter", + "name": "view", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "DatasetItemsListPage", + "type": "reference", + "target": "3658" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis method allows for asynchronously iterating through dataset 
items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3803, + "module": "storages._dataset", + "name": "iterate_items", + "parsedDocstring": { + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis method allows for asynchronously iterating through dataset items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n", + "args": { + "offset": "Skips the specified number of items at the start.", + "limit": "The maximum number of items to retrieve. Unlimited if None.", + "clean": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.", + "desc": "Set to True to sort results in descending order.", + "fields": "Fields to include in each item. 
Sorts fields as specified if provided.", + "omit": "Fields to exclude from each item.", + "unwind": "Unwinds items by a specified array field, turning each element into a separate item.", + "skip_empty": "Excludes empty items from the results if True.", + "skip_hidden": "Excludes fields starting with '#' if True.\n" + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 201 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Iterate over items in the dataset according to specified filters and sorting.\n\nThis method allows for asynchronously iterating through dataset items while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "flags": {}, + "id": 3804, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "iterate_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skips the specified number of items at the start." + } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3805, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." 
+ } + ] + }, + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3806, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3807, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3808, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3809, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3810, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds items by a specified array field, turning each element into a separate item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3811, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes empty items from the results if True." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3812, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3813, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "AsyncIterator", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a list of all items from the dataset according to specified filters and sorting.\n\nThis method collects all dataset items into a list while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3814, + "module": "storages._dataset", + "name": "list_items", + "parsedDocstring": { + "text": "Retrieve a list of all items from the dataset according to specified filters and sorting.\n\nThis method collects all dataset items into a list while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. 
It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n", + "args": { + "offset": "Skips the specified number of items at the start.", + "limit": "The maximum number of items to retrieve. Unlimited if None.", + "clean": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.", + "desc": "Set to True to sort results in descending order.", + "fields": "Fields to include in each item. Sorts fields as specified if provided.", + "omit": "Fields to exclude from each item.", + "unwind": "Unwinds items by a specified array field, turning each element into a separate item.", + "skip_empty": "Excludes empty items from the results if True.", + "skip_hidden": "Excludes fields starting with '#' if True.\n" + }, + "returns": "A list of dictionary objects, each representing a dataset item after applying\nthe specified filters and transformations." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 249 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "A list of dictionary objects, each representing a dataset item after applying\nthe specified filters and transformations." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a list of all items from the dataset according to specified filters and sorting.\n\nThis method collects all dataset items into a list while applying various filters such as\nskipping empty items, hiding specific fields, and sorting. 
It supports pagination via `offset` and `limit`\nparameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n`skip_hidden` parameters.\n" + } + ] + }, + "flags": {}, + "id": 3815, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "list_items", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Skips the specified number of items at the start." + } + ] + }, + "defaultValue": "0", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3816, + "kind": 32768, + "kindString": "Parameter", + "name": "offset", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The maximum number of items to retrieve. Unlimited if None." + } + ] + }, + "defaultValue": "999_999_999_999", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3817, + "kind": 32768, + "kindString": "Parameter", + "name": "limit", + "type": { + "name": "int | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "int" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3818, + "kind": 32768, + "kindString": "Parameter", + "name": "clean", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Set to True to sort results in descending order." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3819, + "kind": 32768, + "kindString": "Parameter", + "name": "desc", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to include in each item. Sorts fields as specified if provided." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3820, + "kind": 32768, + "kindString": "Parameter", + "name": "fields", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Fields to exclude from each item." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3821, + "kind": 32768, + "kindString": "Parameter", + "name": "omit", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unwinds items by a specified array field, turning each element into a separate item." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3822, + "kind": 32768, + "kindString": "Parameter", + "name": "unwind", + "type": { + "name": "list[str] | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "list", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes empty items from the results if True." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3823, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_empty", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Excludes fields starting with '#' if True.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3824, + "kind": 32768, + "kindString": "Parameter", + "name": "skip_hidden", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "list", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "dict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Any" + } + ] + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. 
The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3825, + "module": "storages._dataset", + "name": "export_to", + "parsedDocstring": { + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n", + "args": { + "key": "The key under which to save the data in the key-value store.", + "content_type": "The format in which to export the data.", + "to_kvs_id": "ID of the key-value store to save the exported file.\nSpecify only one of ID or name.", + "to_kvs_name": "Name of the key-value store to save the exported file.\nSpecify only one of ID or name.", + "to_kvs_storage_client": "Storage client to use for the key-value store.", + "to_kvs_configuration": "Configuration for the key-value store.", + "kwargs": "Additional parameters for the export operation, specific to the chosen content type." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 323 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. 
The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "flags": {}, + "id": 3826, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_to", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data in the key-value store." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3827, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data." + } + ] + }, + "defaultValue": "'json'", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3828, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "json" + }, + { + "type": "literal", + "value": "csv" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file.\nSpecify only one of ID or name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3829, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file.\nSpecify only one of ID or name." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3830, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage client to use for the key-value store." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3831, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration for the key-value store." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3832, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional parameters for the export operation, specific to the chosen content type." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3833, + "kind": 32768, + "kindString": "Parameter", + "name": "kwargs", + "type": { + "name": "Any", + "type": "reference" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "flags": {}, + "id": 3834, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_to", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data in the key-value store." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3835, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3836, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "json" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file.\nSpecify only one of ID or name." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3837, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file.\nSpecify only one of ID or name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3838, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage client to use for the key-value store." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3839, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration for the key-value store." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3840, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 543, + "module": "_types", + "name": "skipkeys", + "parsedDocstring": { + "text": "If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\ninstead of raising a `TypeError`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 753 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 544, + "module": "_types", + "name": "ensure_ascii", + "parsedDocstring": { + "text": "Determines if non-ASCII characters should be escaped in the output JSON string." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 757 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 545, + "module": "_types", + "name": "check_circular", + "parsedDocstring": { + "text": "If False (default: True), skips the circular reference check for container types. A circular reference will\nresult in a `RecursionError` or worse if unchecked." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 760 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 546, + "module": "_types", + "name": "allow_nan", + "parsedDocstring": { + "text": "If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\nwith the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 764 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Allows specifying a custom JSON encoder." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 547, + "module": "_types", + "name": "cls", + "parsedDocstring": { + "text": "Allows specifying a custom JSON encoder." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 768 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "json.JSONEncoder" + } + ], + "target": "981" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 548, + "module": "_types", + "name": "indent", + "parsedDocstring": { + "text": "Specifies the number of spaces for indentation in the pretty-printed JSON output." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 771 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 549, + "module": "_types", + "name": "separators", + "parsedDocstring": { + "text": "A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\notherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 774 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "tuple", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "str" + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 550, + "module": "_types", + "name": "default", + "parsedDocstring": { + "text": "A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\nof the object or raise a `TypeError`." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 778 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "Callable" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 551, + "module": "_types", + "name": "sort_keys", + "parsedDocstring": { + "text": "Specifies whether the output JSON object should have keys sorted alphabetically." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 782 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Export the entire dataset into a specified file stored under a key in a key-value store.\n\nThis method consolidates all entries from a specified dataset into one file, which is then saved under a\ngiven key in a key-value store. The format of the exported file is determined by the `content_type` parameter.\nEither the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or\nname should be used.\n" + } + ] + }, + "flags": {}, + "id": 3842, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "export_to", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The key under which to save the data in the key-value store." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3843, + "kind": 32768, + "kindString": "Parameter", + "name": "key", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The format in which to export the data." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3844, + "kind": 32768, + "kindString": "Parameter", + "name": "content_type", + "type": { + "name": "Literal", + "type": "reference", + "typeArguments": [ + { + "type": "literal", + "value": "csv" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "ID of the key-value store to save the exported file.\nSpecify only one of ID or name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3845, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Name of the key-value store to save the exported file.\nSpecify only one of ID or name." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3846, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage client to use for the key-value store." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3847, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration for the key-value store." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 3848, + "kind": 32768, + "kindString": "Parameter", + "name": "to_kvs_configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Specifies a dialect to be used in CSV parsing and writing." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 553, + "module": "_types", + "name": "dialect", + "parsedDocstring": { + "text": "Specifies a dialect to be used in CSV parsing and writing." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 789 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to separate fields. Defaults to ','." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 554, + "module": "_types", + "name": "delimiter", + "parsedDocstring": { + "text": "A one-character string used to separate fields. Defaults to ','." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 792 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 555, + "module": "_types", + "name": "doublequote", + "parsedDocstring": { + "text": "Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\nwhen False, the `escapechar` is used as a prefix. Defaults to True." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 795 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 556, + "module": "_types", + "name": "escapechar", + "parsedDocstring": { + "text": "A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\nif `doublequote` is False. Defaults to None, disabling escaping." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 799 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 557, + "module": "_types", + "name": "lineterminator", + "parsedDocstring": { + "text": "The string used to terminate lines produced by the writer. Defaults to '\\r\\n'." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 803 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 558, + "module": "_types", + "name": "quotechar", + "parsedDocstring": { + "text": "A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\nor fields containing new-line characters. Defaults to '\"'." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 806 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "str" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 559, + "module": "_types", + "name": "quoting", + "parsedDocstring": { + "text": "Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\nthe `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 810 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "int" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." 
+ } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 560, + "module": "_types", + "name": "skipinitialspace", + "parsedDocstring": { + "text": "When True, spaces immediately following the delimiter are ignored. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 814 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "When True, raises an exception on bad CSV input. Defaults to False." + } + ] + }, + "flags": { + "keyword-only": true, + "optional": true + }, + "groups": [], + "id": 561, + "module": "_types", + "name": "strict", + "parsedDocstring": { + "text": "When True, raises an exception on bad CSV input. Defaults to False." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/_types.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 817 + } + ], + "type": { + "name": "NotRequired", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "bool" + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Dataset is a storage for managing structured tabular data.\n\nThe dataset class provides a high-level interface for storing and retrieving structured data\nwith consistent schema, similar to database tables or spreadsheets. It abstracts the underlying\nstorage implementation details, offering a consistent API regardless of where the data is\nphysically stored.\n\nDataset operates in an append-only mode, allowing new records to be added but not modified\nor deleted after creation. 
This makes it particularly suitable for storing crawling results\nand other data that should be immutable once collected.\n\nThe class provides methods for adding data, retrieving data with various filtering options,\nand exporting data to different formats. You can create a dataset using the `open` class method,\nspecifying either a name or ID. The underlying storage implementation is determined by\nthe configured storage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import Dataset\n\n# Open a dataset\ndataset = await Dataset.open(name='my-dataset')\n\n# Add data\nawait dataset.push_data({'title': 'Example Product', 'price': 99.99})\n\n# Retrieve filtered data\nresults = await dataset.get_data(limit=10, desc=True)\n\n# Export data\nawait dataset.export_to('results.json', content_type='json')\n```" + } + ] + }, + "decorations": [ + { + "args": "('Storages')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3767, + 3783, + 3825, + 3790, + 3774, + 3803, + 3814, + 3776, + 3785, + 3787 + ], + "title": "Methods" + }, + { + "children": [ + 3772, + 3773 + ], + "title": "Properties" + } + ], + "id": 3766, + "module": "storages._dataset", + "name": "Dataset", + "parsedDocstring": { + "text": "Dataset is a storage for managing structured tabular data.\n\nThe dataset class provides a high-level interface for storing and retrieving structured data\nwith consistent schema, similar to database tables or spreadsheets. It abstracts the underlying\nstorage implementation details, offering a consistent API regardless of where the data is\nphysically stored.\n\nDataset operates in an append-only mode, allowing new records to be added but not modified\nor deleted after creation. This makes it particularly suitable for storing crawling results\nand other data that should be immutable once collected.\n\nThe class provides methods for adding data, retrieving data with various filtering options,\nand exporting data to different formats. 
You can create a dataset using the `open` class method,\nspecifying either a name or ID. The underlying storage implementation is determined by\nthe configured storage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import Dataset\n\n# Open a dataset\ndataset = await Dataset.open(name='my-dataset')\n\n# Add data\nawait dataset.push_data({'title': 'Example Product', 'price': 99.99})\n\n# Retrieve filtered data\nresults = await dataset.get_data(limit=10, desc=True)\n\n# Export data\nawait dataset.export_to('results.json', content_type='json')\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_dataset.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Storage", + "target": "3680", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3850, + "module": "storages._request_queue", + "name": "logger", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 27 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3851, + "module": "storages._request_queue", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 29 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": 
"Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RequestQueue.open` constructor to create a new instance.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3853, + "module": "storages._request_queue", + "name": "__init__", + "parsedDocstring": { + "text": "Initialize a new instance.\n\nPreferably use the `RequestQueue.open` constructor to create a new instance.\n", + "args": { + "client": "An instance of a storage client.", + "id": "The unique identifier of the storage.", + "name": "The name of the storage, if available." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 74 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Initialize a new instance.\n\nPreferably use the `RequestQueue.open` constructor to create a new instance.\n" + } + ] + }, + "flags": {}, + "id": 3854, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "An instance of a storage client." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3855, + "kind": 32768, + "kindString": "Parameter", + "name": "client", + "type": { + "name": "RequestQueueClient", + "type": "reference", + "target": "2759" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The unique identifier of the storage." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3856, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The name of the storage, if available." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3857, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage ID." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3858, + "module": "storages._base", + "name": "id", + "parsedDocstring": { + "text": "Get the storage ID." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 95 + } + ], + "type": { + "name": "str", + "type": "reference" + }, + "overwrites": { + "name": "Storage.id", + "target": 3681, + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage name." + } + ] + }, + "decorations": [ + { + "name": "property" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3859, + "module": "storages._base", + "name": "name", + "parsedDocstring": { + "text": "Get the storage name." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 100 + } + ], + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "Storage.name", + "target": 3682, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3860, + "module": "storages._base", + "name": "get_metadata", + "parsedDocstring": { + "text": "Get the storage metadata." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 104 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the storage metadata." 
+ } + ] + }, + "flags": {}, + "id": 3684, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_metadata", + "parameters": [], + "type": { + "name": "DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata", + "type": "union", + "types": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "DatasetMetadata", + "target": "3639" + }, + { + "type": "reference", + "name": "KeyValueStoreMetadata", + "target": "3642" + } + ] + }, + { + "type": "reference", + "name": "RequestQueueMetadata", + "target": "3644" + } + ] + }, + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.get_metadata", + "target": 3683, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3862, + "module": "request_loaders._request_loader", + "name": "get_handled_count", + "parsedDocstring": { + "text": "Get the number of requests in the loader that have been handled." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 108 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get the number of requests in the loader that have been handled." 
+ } + ] + }, + "flags": {}, + "id": 2275, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_handled_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3995, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_handled_count", + "target": 3995, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3864, + "module": "request_loaders._request_loader", + "name": "get_total_count", + "parsedDocstring": { + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 113 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Get an offline approximation of the total number of requests in the loader (i.e. pending + handled)." 
+ } + ] + }, + "flags": {}, + "id": 2277, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_total_count", + "parameters": [], + "type": { + "name": "int", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3996, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.get_total_count", + "target": 3996, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "decorations": [ + { + "name": "classmethod" + }, + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3866, + "module": "storages._base", + "name": "open", + "parsedDocstring": { + "text": "Open a storage, either restore existing or create a new one.\n", + "args": { + "id": "The storage ID.", + "name": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\").", + "alias": "The storage alias (run scope, creates unnamed storage).", + "configuration": "Configuration object used during the storage creation or restoration process.", + "storage_client": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 119 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage, either restore existing or create a new one.\n" + } + ] + }, + "flags": {}, + "id": 3686, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage ID." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3687, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\")." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3688, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage alias (run scope, creates unnamed storage)." 
+ } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3689, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Configuration object used during the storage creation or restoration process." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3690, + "kind": 32768, + "kindString": "Parameter", + "name": "configuration", + "type": { + "name": "Configuration | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Configuration", + "target": "235" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Underlying storage client to use. If not provided, the default global storage client\nfrom the service locator will be used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3691, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client", + "type": { + "name": "StorageClient | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "StorageClient", + "target": "2784" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + }, + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.open", + "target": 3685, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." 
+ } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3873, + "module": "storages._base", + "name": "drop", + "parsedDocstring": { + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 144 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Drop the storage, removing it from the underlying storage client and clearing the cache." + } + ] + }, + "flags": {}, + "id": 3693, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "drop", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.drop", + "target": 2340, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3875, + "module": "storages._base", + "name": "purge", + "parsedDocstring": { + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 152 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Purge the storage, removing all items from the underlying storage client.\n\nThis method does not remove the storage itself, e.g. don't remove the metadata,\nbut clears all items within it." + } + ] + }, + "flags": {}, + "id": 3695, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "purge", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + } + ], + "overwrites": { + "name": "Storage.purge", + "target": 3694, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "decorations": [ + { + "name": "abstractmethod" + } + ], + "flags": {}, + "groups": [], + "id": 3877, + "module": "request_loaders._request_manager", + "name": "add_request", + "parsedDocstring": { + "text": "Add a single request to the manager and store it in underlying resource client.\n", + "args": { + "request": "The request object (or its string representation) to be added to the manager.", + "forefront": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + }, + "returns": "Information about the request addition to the manager or None if the request was not added." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 156 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the request addition to the manager or None if the request was not added." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Add a single request to the manager and store it in underlying resource client.\n" + } + ] + }, + "flags": {}, + "id": 2343, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request object (or its string representation) to be added to the manager." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2344, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "str | Request", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Determines whether the request should be added to the beginning (if True) or the end (if False)\nof the manager.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2345, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.add_request", + 
"target": 2342, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_request", + "target": 2342, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3881, + "module": "request_loaders._request_manager", + "name": "add_requests", + "parsedDocstring": { + "text": "Add requests to the manager in batches.\n", + "args": { + "requests": "Requests to enqueue.", + "forefront": "If True, add requests to the beginning of the queue.", + "batch_size": "The number of requests to add in one batch.", + "wait_time_between_batches": "Time to wait between adding batches.", + "wait_for_all_requests_to_be_added": "If True, wait for all requests to be added before returning.", + "wait_for_all_requests_to_be_added_timeout": "Timeout for waiting for all requests to be added." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 180 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Add requests to the manager in batches.\n" + } + ] + }, + "flags": {}, + "id": 2347, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "add_requests", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Requests to enqueue." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 2348, + "kind": 32768, + "kindString": "Parameter", + "name": "requests", + "type": { + "name": "Sequence", + "type": "reference", + "typeArguments": [ + { + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "Request", + "target": "150" + } + ] + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, add requests to the beginning of the queue." + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2349, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The number of requests to add in one batch." + } + ] + }, + "defaultValue": "1000", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2350, + "kind": 32768, + "kindString": "Parameter", + "name": "batch_size", + "type": { + "name": "int", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Time to wait between adding batches." + } + ] + }, + "defaultValue": "timedelta(seconds=1)", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2351, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_time_between_batches", + "type": { + "name": "timedelta", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If True, wait for all requests to be added before returning." 
+ } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2352, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added", + "type": { + "name": "bool", + "type": "reference" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Timeout for waiting for all requests to be added." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 2353, + "kind": 32768, + "kindString": "Parameter", + "name": "wait_for_all_requests_to_be_added_timeout", + "type": { + "name": "timedelta | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "timedelta" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + }, + "overwrites": { + "name": "RequestManager.add_requests", + "target": 2346, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.add_requests", + "target": 2346, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. 
To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3889, + "module": "storages._request_queue", + "name": "fetch_next_request", + "parsedDocstring": { + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n", + "returns": "The next request to process, or `None` if there are no more pending requests." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 230 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The next request to process, or `None` if there are no more pending requests." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Return the next request in the queue to be processed.\n\nOnce you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\nto mark the request as handled in the queue. 
If there was some error in processing the request, call\n`RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\nin another call to the `fetch_next_request` method.\n\nNote that the `None` return value does not mean the queue processing finished, it means there are currently\nno pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\ninstead.\n" + } + ] + }, + "flags": {}, + "id": 3890, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "fetch_next_request", + "parameters": [], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3999, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.fetch_next_request", + "target": 3999, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Retrieve a specific request from the queue by its ID.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3891, + "module": "storages._request_queue", + "name": "get_request", + "parsedDocstring": { + "text": "Retrieve a specific request from the queue by its ID.\n", + "args": { + "unique_key": "Unique key of the request to retrieve.\n" + }, + "returns": "The request with the specified ID, or `None` if no such request exists." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 247 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The request with the specified ID, or `None` if no such request exists." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Retrieve a specific request from the queue by its ID.\n" + } + ] + }, + "flags": {}, + "id": 3892, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "get_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Unique key of the request to retrieve.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3893, + "kind": 32768, + "kindString": "Parameter", + "name": "unique_key", + "type": { + "name": "str", + "type": "reference" + } + } + ], + "type": { + "name": "Request | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "Request", + "target": "150" + }, + { + "type": "literal", + "value": null + } + ] + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nThis method should be called after a request has been successfully processed.\nOnce marked as handled, the request will be removed from the queue and will\nnot be returned in subsequent calls to `fetch_next_request` method.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3894, + "module": "storages._request_queue", + "name": "mark_request_as_handled", + "parsedDocstring": { + "text": "Mark a request as handled after successful processing.\n\nThis method should be called after a request has been 
successfully processed.\nOnce marked as handled, the request will be removed from the queue and will\nnot be returned in subsequent calls to `fetch_next_request` method.\n", + "args": { + "request": "The request to mark as handled.\n" + }, + "returns": "Information about the queue operation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 258 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Mark a request as handled after successful processing.\n\nThis method should be called after a request has been successfully processed.\nOnce marked as handled, the request will be removed from the queue and will\nnot be returned in subsequent calls to `fetch_next_request` method.\n" + } + ] + }, + "flags": {}, + "id": 3895, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "mark_request_as_handled", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to mark as handled.\n" + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3896, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + "target": 4000, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.mark_request_as_handled", + 
"target": 4000, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue for later processing.\n\nIf a request fails during processing, this method can be used to return it to the queue.\nThe request will be returned for processing again in a subsequent call\nto `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3897, + "module": "storages._request_queue", + "name": "reclaim_request", + "parsedDocstring": { + "text": "Reclaim a failed request back to the queue for later processing.\n\nIf a request fails during processing, this method can be used to return it to the queue.\nThe request will be returned for processing again in a subsequent call\nto `RequestQueue.fetch_next_request`.\n", + "args": { + "request": "The request to return to the queue.", + "forefront": "If true, the request will be added to the beginning of the queue.\nOtherwise, it will be added to the end.\n" + }, + "returns": "Information about the queue operation." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 273 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "Information about the queue operation." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Reclaim a failed request back to the queue for later processing.\n\nIf a request fails during processing, this method can be used to return it to the queue.\nThe request will be returned for processing again in a subsequent call\nto `RequestQueue.fetch_next_request`.\n" + } + ] + }, + "flags": {}, + "id": 3898, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "reclaim_request", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The request to return to the queue." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3899, + "kind": 32768, + "kindString": "Parameter", + "name": "request", + "type": { + "name": "Request", + "type": "reference", + "target": "150" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "If true, the request will be added to the beginning of the queue.\nOtherwise, it will be added to the end.\n" + } + ] + }, + "defaultValue": "False", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3900, + "kind": 32768, + "kindString": "Parameter", + "name": "forefront", + "type": { + "name": "bool", + "type": "reference" + } + } + ], + "type": { + "name": "ProcessedRequest | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "ProcessedRequest", + "target": "3665" + }, + { + "type": "literal", + "value": null + } + ] + }, + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 2354, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.reclaim_request", + "target": 2354, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n\nAn empty queue means that there are no requests currently in the queue, either pending 
or being processed.\nHowever, this does not necessarily mean that the crawling operation is finished, as there still might be\ntasks that could add additional requests to the queue.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3901, + "module": "storages._request_queue", + "name": "is_empty", + "parsedDocstring": { + "text": "Check if the request queue is empty.\n\nAn empty queue means that there are no requests currently in the queue, either pending or being processed.\nHowever, this does not necessarily mean that the crawling operation is finished, as there still might be\ntasks that could add additional requests to the queue.\n", + "returns": "True if the request queue is empty, False otherwise." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 295 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the request queue is empty, False otherwise." 
+ } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is empty.\n\nAn empty queue means that there are no requests currently in the queue, either pending or being processed.\nHowever, this does not necessarily mean that the crawling operation is finished, as there still might be\ntasks that could add additional requests to the queue.\n" + } + ] + }, + "flags": {}, + "id": 3902, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_empty", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3997, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_empty", + "target": 3997, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is finished.\n\nA finished queue means that all requests in the queue have been processed (the queue is empty) and there\nare no more tasks that could add additional requests to the queue. This is the definitive way to check\nif a crawling operation is complete.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3903, + "module": "storages._request_queue", + "name": "is_finished", + "parsedDocstring": { + "text": "Check if the request queue is finished.\n\nA finished queue means that all requests in the queue have been processed (the queue is empty) and there\nare no more tasks that could add additional requests to the queue. This is the definitive way to check\nif a crawling operation is complete.\n", + "returns": "True if the request queue is finished (empty and no pending add operations), False otherwise." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 307 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "True if the request queue is finished (empty and no pending add operations), False otherwise." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Check if the request queue is finished.\n\nA finished queue means that all requests in the queue have been processed (the queue is empty) and there\nare no more tasks that could add additional requests to the queue. This is the definitive way to check\nif a crawling operation is complete.\n" + } + ] + }, + "flags": {}, + "id": 3904, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "is_finished", + "parameters": [], + "type": { + "name": "bool", + "type": "reference" + }, + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3998, + "type": "reference" + } + } + ], + "overwrites": { + "name": "RequestManager.is_finished", + "target": 3998, + "type": "reference" + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 4004, + "module": "request_loaders._request_loader", + "name": "to_tandem", + "parsedDocstring": { + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n", + "args": { + "request_manager": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." 
+ } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/request_loaders/_request_loader.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 56 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Combine the loader with a request manager to support adding and reclaiming requests.\n" + } + ] + }, + "flags": {}, + "id": 2288, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "to_tandem", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request manager to combine the loader with.\nIf None is given, the default request queue is used." + } + ] + }, + "defaultValue": "None", + "flags": { + "isOptional": true, + "keyword-only": false + }, + "id": 2289, + "kind": 32768, + "kindString": "Parameter", + "name": "request_manager", + "type": { + "name": "RequestManager | None", + "type": "reference" + } + } + ], + "type": { + "name": "RequestManagerTandem", + "type": "reference", + "target": "2359" + }, + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "inheritedFrom": { + "name": "RequestLoader.to_tandem", + "target": 2287, + "type": "reference" + } + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Request queue is a storage for managing HTTP requests.\n\nThe request queue class serves as a high-level interface for organizing and managing HTTP requests\nduring web crawling. It provides methods for adding, retrieving, and manipulating requests throughout\nthe crawling lifecycle, abstracting away the underlying storage implementation details.\n\nRequest queue maintains the state of each URL to be crawled, tracking whether it has been processed,\nis currently being handled, or is waiting in the queue. 
Each URL in the queue is uniquely identified\nby a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise.\n\nThe class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter\nwhen adding requests. It also provides mechanisms for error handling and request reclamation when\nprocessing fails.\n\nYou can open a request queue using the `open` class method, specifying either a name or ID to identify\nthe queue. The underlying storage implementation is determined by the configured storage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import RequestQueue\n\n# Open a request queue\nrq = await RequestQueue.open(name='my-queue')\n\n# Add a request\nawait rq.add_request('https://example.com')\n\n# Process requests\nrequest = await rq.fetch_next_request()\nif request:\n try:\n # Process the request\n # ...\n await rq.mark_request_as_handled(request)\n except Exception:\n await rq.reclaim_request(request)\n```" + } + ] + }, + "decorations": [ + { + "args": "('Storages')", + "name": "docs_group" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3853, + 3877, + 3881, + 3873, + 3889, + 3862, + 3860, + 3891, + 3864, + 3901, + 3903, + 3894, + 3866, + 3875, + 3897, + 4004 + ], + "title": "Methods" + }, + { + "children": [ + 3858, + 3859 + ], + "title": "Properties" + } + ], + "id": 3852, + "module": "storages._request_queue", + "name": "RequestQueue", + "parsedDocstring": { + "text": "Request queue is a storage for managing HTTP requests.\n\nThe request queue class serves as a high-level interface for organizing and managing HTTP requests\nduring web crawling. 
It provides methods for adding, retrieving, and manipulating requests throughout\nthe crawling lifecycle, abstracting away the underlying storage implementation details.\n\nRequest queue maintains the state of each URL to be crawled, tracking whether it has been processed,\nis currently being handled, or is waiting in the queue. Each URL in the queue is uniquely identified\nby a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise.\n\nThe class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter\nwhen adding requests. It also provides mechanisms for error handling and request reclamation when\nprocessing fails.\n\nYou can open a request queue using the `open` class method, specifying either a name or ID to identify\nthe queue. The underlying storage implementation is determined by the configured storage client.\n\n### Usage\n\n```python\nfrom crawlee.storages import RequestQueue\n\n# Open a request queue\nrq = await RequestQueue.open(name='my-queue')\n\n# Add a request\nawait rq.add_request('https://example.com')\n\n# Process requests\nrequest = await rq.fetch_next_request()\nif request:\n try:\n # Process the request\n # ...\n await rq.mark_request_as_handled(request)\n except Exception:\n await rq.reclaim_request(request)\n```" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_request_queue.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 33 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "extendedTypes": [ + { + "name": "Storage", + "target": "3680", + "type": "reference" + }, + { + "name": "RequestManager", + "target": "2339", + "type": "reference" + } + ] + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3905, + "module": 
"storages._storage_instance_manager", + "name": "T", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 18 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3907, + "module": "storages._storage_instance_manager", + "name": "by_id", + "parsedDocstring": { + "text": "Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 25 + } + ], + "type": { + "name": "defaultdict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ], + "target": "981" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "Hashable" + }, + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cache for storage instances by name. 
Example: by_name[Dataset]['some_name']['some_additional_cache_key']" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3908, + "module": "storages._storage_instance_manager", + "name": "by_name", + "parsedDocstring": { + "text": "Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 30 + } + ], + "type": { + "name": "defaultdict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ], + "target": "981" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "Hashable" + }, + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ] + } + ] + } + ] + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']" + } + ] + }, + "flags": {}, + "groups": [], + "id": 3909, + "module": "storages._storage_instance_manager", + "name": "by_alias", + "parsedDocstring": { + "text": "Cache for storage instances by alias. 
Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 35 + } + ], + "type": { + "name": "defaultdict", + "type": "reference", + "typeArguments": [ + { + "type": "reference", + "name": "type", + "typeArguments": [ + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ], + "target": "981" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "reference", + "name": "defaultdict", + "typeArguments": [ + { + "type": "reference", + "name": "Hashable" + }, + { + "type": "reference", + "name": "Storage", + "target": "3680" + } + ] + } + ] + } + ] + } + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage instance from the cache.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3910, + "module": "storages._storage_instance_manager", + "name": "remove_from_cache", + "parsedDocstring": { + "text": "Remove a storage instance from the cache.\n", + "args": { + "storage_instance": "The storage instance to remove." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 40 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage instance from the cache.\n" + } + ] + }, + "flags": {}, + "id": 3911, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "remove_from_cache", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage instance to remove." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3912, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_instance", + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Cache for storage instances." + } + ] + }, + "decorations": [ + { + "name": "dataclass" + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 3910 + ], + "title": "Methods" + }, + { + "children": [ + 3909, + 3907, + 3908 + ], + "title": "Properties" + } + ], + "id": 3906, + "module": "storages._storage_instance_manager", + "name": "_StorageCache", + "parsedDocstring": { + "text": "Cache for storage instances." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 22 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 1024, + "kindString": "Property", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Type alias for the client opener function." + } + ] + }, + "flags": {}, + "groups": [], + "id": 3913, + "module": "storages._storage_instance_manager", + "name": "ClientOpenerCoro", + "parsedDocstring": { + "text": "Type alias for the client opener function." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 65 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + }, + { + "kind": 128, + "kindString": "Class", + "children": [ + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3915, + "module": "storages._storage_instance_manager", + "name": "__init__", + "parsedDocstring": { + "text": "" + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 79 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "flags": {}, + "id": 3916, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "__init__", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Open a storage instance with caching support.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3917, + "module": "storages._storage_instance_manager", + "name": "open_storage_instance", + "parsedDocstring": { + "text": "Open a storage instance with caching support.\n", + "args": { + "cls": "The storage class to instantiate.", + "id": "Storage ID.", + "name": "Storage name. (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. 
\"my-value-1\").", + "alias": "Storage alias (run scope, creates unnamed storage).", + "client_opener_coro": "Coroutine to open the storage client when storage instance not found in cache.", + "storage_client_cache_key": "Additional optional key from storage client to differentiate cache entries.\n" + }, + "returns": "The storage instance." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 83 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "blockTags": [ + { + "content": [ + { + "kind": "text", + "text": "The storage instance." + } + ], + "tag": "@returns" + } + ], + "summary": [ + { + "kind": "text", + "text": "Open a storage instance with caching support.\n" + } + ] + }, + "flags": {}, + "id": 3918, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [ + "async" + ], + "name": "open_storage_instance", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage ID." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3919, + "kind": 32768, + "kindString": "Parameter", + "name": "id", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage name. (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\nthe digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n(e.g. \"my-value-1\")." 
+ } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3920, + "kind": 32768, + "kindString": "Parameter", + "name": "name", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Storage alias (run scope, creates unnamed storage)." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3921, + "kind": 32768, + "kindString": "Parameter", + "name": "alias", + "type": { + "name": "str | None", + "type": "union", + "types": [ + { + "type": "reference", + "name": "str" + }, + { + "type": "literal", + "value": null + } + ] + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Coroutine to open the storage client when storage instance not found in cache." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": true + }, + "id": 3922, + "kind": 32768, + "kindString": "Parameter", + "name": "client_opener_coro", + "type": { + "name": "ClientOpenerCoro", + "type": "reference", + "target": "3913" + } + }, + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Additional optional key from storage client to differentiate cache entries.\n" + } + ] + }, + "defaultValue": "''", + "flags": { + "isOptional": true, + "keyword-only": true + }, + "id": 3923, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_client_cache_key", + "type": { + "name": "Hashable", + "type": "reference" + } + } + ], + "type": { + "name": "T", + "type": "reference", + "target": "299" + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage instance from the cache.\n" + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3924, + "module": "storages._storage_instance_manager", + "name": 
"remove_from_cache", + "parsedDocstring": { + "text": "Remove a storage instance from the cache.\n", + "args": { + "storage_instance": "The storage instance to remove." + } + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 195 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Remove a storage instance from the cache.\n" + } + ] + }, + "flags": {}, + "id": 3925, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "remove_from_cache", + "parameters": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "The storage instance to remove." + } + ] + }, + "flags": { + "isOptional": false, + "keyword-only": false + }, + "id": 3926, + "kind": 32768, + "kindString": "Parameter", + "name": "storage_instance", + "type": { + "name": "Storage", + "type": "reference", + "target": "3680" + } + } + ], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + }, + { + "kind": 2048, + "kindString": "Method", + "children": [], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear all cached storage instances." + } + ] + }, + "decorations": [], + "flags": {}, + "groups": [], + "id": 3927, + "module": "storages._storage_instance_manager", + "name": "clear_cache", + "parsedDocstring": { + "text": "Clear all cached storage instances." + }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 203 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + }, + "signatures": [ + { + "comment": { + "summary": [ + { + "kind": "text", + "text": "Clear all cached storage instances." 
+ } + ] + }, + "flags": {}, + "id": 3928, + "kind": 4096, + "kindString": "Call signature", + "modifiers": [], + "name": "clear_cache", + "parameters": [], + "type": { + "name": "None", + "type": "literal", + "value": null + } + } + ] + } + ], + "comment": { + "summary": [ + { + "kind": "text", + "text": "Manager for caching and managing storage instances.\n\nThis class centralizes the caching logic for all storage types (Dataset, KeyValueStore, RequestQueue)\nand provides a unified interface for opening and managing storage instances." + } + ] + }, + "flags": {}, + "groups": [ + { + "children": [ + 3915, + 3927, + 3917, + 3924 + ], + "title": "Methods" + } + ], + "id": 3914, + "module": "storages._storage_instance_manager", + "name": "StorageInstanceManager", + "parsedDocstring": { + "text": "Manager for caching and managing storage instances.\n\nThis class centralizes the caching logic for all storage types (Dataset, KeyValueStore, RequestQueue)\nand provides a unified interface for opening and managing storage instances." 
+ }, + "sources": [ + { + "character": 1, + "fileName": "/src/crawlee/storages/_storage_instance_manager.py", + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea", + "line": 69 + } + ], + "type": { + "name": "Undefined", + "type": "reference" + } + } + ], + "flags": {}, + "groups": [ + { + "children": [ + 567, + 647, + 586 + ], + "title": "Autoscaling" + }, + { + "children": [ + 1073, + 1090, + 1167, + 1219, + 1141, + 1107 + ], + "title": "Browser management" + }, + { + "children": [ + 328, + 235, + 263, + 211 + ], + "title": "Configuration" + }, + { + "children": [ + 1266, + 1395, + 1555, + 1646, + 1685, + 1712, + 1815 + ], + "title": "Crawlers" + }, + { + "children": [ + 1442, + 1477, + 1470, + 504, + 1651, + 1245, + 2068, + 1252, + 1716, + 1842, + 1846, + 1779 + ], + "title": "Crawling contexts" + }, + { + "children": [ + 68, + 63, + 73, + 57, + 52, + 51, + 74, + 58, + 45, + 44, + 42 + ], + "title": "Errors" + }, + { + "children": [ + 1868, + 1891, + 1895, + 1893, + 1888, + 1881, + 1884 + ], + "title": "Event data" + }, + { + "children": [ + 1907, + 1853 + ], + "title": "Event managers" + }, + { + "children": [ + 422, + 1793, + 430, + 459, + 468, + 480, + 488, + 500 + ], + "title": "Functions" + }, + { + "children": [ + 2132, + 2070, + 2230, + 2179 + ], + "title": "HTTP clients" + }, + { + "children": [ + 1281, + 1661, + 1689, + 1726 + ], + "title": "HTTP parsers" + }, + { + "children": [ + 2039, + 1495, + 2266, + 1341, + 1979, + 1985, + 306, + 2060, + 495, + 254, + 1317, + 1320, + 134, + 78 + ], + "title": "Other" + }, + { + "children": [ + 2403, + 2273, + 2339, + 2359, + 2302 + ], + "title": "Request loaders" + }, + { + "children": [ + 2538, + 2445, + 2557, + 2498 + ], + "title": "Session management" + }, + { + "children": [ + 1304, + 2649, + 2707, + 2666 + ], + "title": "Statistics" + }, + { + "children": [ + 2870, + 3108, + 3222, + 3591, + 2784 + ], + "title": "Storage clients" + }, + { + "children": [ + 3676, + 3658, + 3639, + 3642, + 3655, + 3650, + 
3665, + 150, + 3644, + 3632, + 3671 + ], + "title": "Storage data" + }, + { + "children": [ + 3766, + 3700, + 3852, + 3680 + ], + "title": "Storages" + } + ], + "id": 0, + "kind": 1, + "kindString": "Project", + "name": "apify-client", + "sources": [ + { + "character": 0, + "fileName": "src/index.ts", + "line": 1, + "gitRevision": "02a18ea618daf8c55d9ca272249b58aa45caa4ea" + } + ], + "symbolIdMap": { + "1": { + "qualifiedName": "cli", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "2": { + "qualifiedName": "template_directory", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "3": { + "qualifiedName": "crawler_choices", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "4": { + "qualifiedName": "http_client_choices", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "5": { + "qualifiedName": "package_manager_choices", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "6": { + "qualifiedName": "default_start_url", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "7": { + "qualifiedName": "default_enable_apify_integration", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "8": { + "qualifiedName": "default_install_project", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "9": { + "qualifiedName": "callback", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "12": { + "qualifiedName": "create", + "sourceFileName": "/src/crawlee/_cli.py" + }, + "21": { + "qualifiedName": "METADATA_FILENAME", + "sourceFileName": "/src/crawlee/_consts.py" + }, + "22": { + "qualifiedName": "string_to_log_level", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "25": { + "qualifiedName": "get_configured_log_level", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "27": { + "qualifiedName": "configure_logger", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "31": { + "qualifiedName": "CrawleeLogFormatter", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "32": { + "qualifiedName": "empty_record", + "sourceFileName": "/src/crawlee/_log_config.py" + }, 
+ "33": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "38": { + "qualifiedName": "format", + "sourceFileName": "/src/crawlee/_log_config.py" + }, + "41": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/src/crawlee/errors.py" + }, + "42": { + "qualifiedName": "UserDefinedErrorHandlerError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "43": { + "qualifiedName": "UserHandlerTimeoutError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "44": { + "qualifiedName": "SessionError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "45": { + "qualifiedName": "ServiceConflictError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "46": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + }, + "51": { + "qualifiedName": "ProxyError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "52": { + "qualifiedName": "HttpStatusCodeError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "53": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + }, + "57": { + "qualifiedName": "HttpClientStatusCodeError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "58": { + "qualifiedName": "RequestHandlerError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "59": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + }, + "63": { + "qualifiedName": "ContextPipelineInitializationError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "64": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + }, + "68": { + "qualifiedName": "ContextPipelineFinalizationError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "69": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + }, + "73": { + "qualifiedName": "ContextPipelineInterruptedError", + "sourceFileName": "/src/crawlee/errors.py" + }, + "74": { + "qualifiedName": "RequestCollisionError", + "sourceFileName": 
"/src/crawlee/errors.py" + }, + "75": { + "qualifiedName": "__version__", + "sourceFileName": "/src/crawlee/__init__.py" + }, + "76": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/src/crawlee/router.py" + }, + "77": { + "qualifiedName": "RequestHandler", + "sourceFileName": "/src/crawlee/router.py" + }, + "78": { + "qualifiedName": "Router", + "sourceFileName": "/src/crawlee/router.py" + }, + "79": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/router.py" + }, + "81": { + "qualifiedName": "default_handler", + "sourceFileName": "/src/crawlee/router.py" + }, + "84": { + "qualifiedName": "handler", + "sourceFileName": "/src/crawlee/router.py" + }, + "87": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/router.py" + }, + "90": { + "qualifiedName": "RequestState", + "sourceFileName": "/src/crawlee/_request.py" + }, + "91": { + "qualifiedName": "UNPROCESSED", + "sourceFileName": "/src/crawlee/_request.py" + }, + "92": { + "qualifiedName": "BEFORE_NAV", + "sourceFileName": "/src/crawlee/_request.py" + }, + "93": { + "qualifiedName": "AFTER_NAV", + "sourceFileName": "/src/crawlee/_request.py" + }, + "94": { + "qualifiedName": "REQUEST_HANDLER", + "sourceFileName": "/src/crawlee/_request.py" + }, + "95": { + "qualifiedName": "DONE", + "sourceFileName": "/src/crawlee/_request.py" + }, + "96": { + "qualifiedName": "ERROR_HANDLER", + "sourceFileName": "/src/crawlee/_request.py" + }, + "97": { + "qualifiedName": "ERROR", + "sourceFileName": "/src/crawlee/_request.py" + }, + "98": { + "qualifiedName": "SKIPPED", + "sourceFileName": "/src/crawlee/_request.py" + }, + "99": { + "qualifiedName": "CrawleeRequestData", + "sourceFileName": "/src/crawlee/_request.py" + }, + "100": { + "qualifiedName": "max_retries", + "sourceFileName": "/src/crawlee/_request.py" + }, + "101": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/src/crawlee/_request.py" + }, + "102": { + "qualifiedName": "state", + "sourceFileName": 
"/src/crawlee/_request.py" + }, + "103": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "104": { + "qualifiedName": "skip_navigation", + "sourceFileName": "/src/crawlee/_request.py" + }, + "105": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/src/crawlee/_request.py" + }, + "106": { + "qualifiedName": "forefront", + "sourceFileName": "/src/crawlee/_request.py" + }, + "107": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/src/crawlee/_request.py" + }, + "108": { + "qualifiedName": "session_id", + "sourceFileName": "/src/crawlee/_request.py" + }, + "109": { + "qualifiedName": "UserData", + "sourceFileName": "/src/crawlee/_request.py" + }, + "110": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_request.py" + }, + "111": { + "qualifiedName": "__pydantic_extra__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "112": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/src/crawlee/_request.py" + }, + "113": { + "qualifiedName": "label", + "sourceFileName": "/src/crawlee/_request.py" + }, + "114": { + "qualifiedName": "__getitem__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "117": { + "qualifiedName": "__setitem__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "121": { + "qualifiedName": "__delitem__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "124": { + "qualifiedName": "__iter__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "126": { + "qualifiedName": "__len__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "128": { + "qualifiedName": "__eq__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "131": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_request.py" + }, + "133": { + "qualifiedName": "user_data_adapter", + "sourceFileName": "/src/crawlee/_request.py" + }, + "134": { + "qualifiedName": "RequestOptions", + "sourceFileName": "/src/crawlee/_request.py" + }, + "135": { + 
"qualifiedName": "url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "136": { + "qualifiedName": "method", + "sourceFileName": "/src/crawlee/_request.py" + }, + "137": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/_request.py" + }, + "138": { + "qualifiedName": "payload", + "sourceFileName": "/src/crawlee/_request.py" + }, + "139": { + "qualifiedName": "label", + "sourceFileName": "/src/crawlee/_request.py" + }, + "140": { + "qualifiedName": "session_id", + "sourceFileName": "/src/crawlee/_request.py" + }, + "141": { + "qualifiedName": "unique_key", + "sourceFileName": "/src/crawlee/_request.py" + }, + "142": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/_request.py" + }, + "143": { + "qualifiedName": "keep_url_fragment", + "sourceFileName": "/src/crawlee/_request.py" + }, + "144": { + "qualifiedName": "use_extended_unique_key", + "sourceFileName": "/src/crawlee/_request.py" + }, + "145": { + "qualifiedName": "always_enqueue", + "sourceFileName": "/src/crawlee/_request.py" + }, + "146": { + "qualifiedName": "user_data", + "sourceFileName": "/src/crawlee/_request.py" + }, + "147": { + "qualifiedName": "no_retry", + "sourceFileName": "/src/crawlee/_request.py" + }, + "148": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/src/crawlee/_request.py" + }, + "149": { + "qualifiedName": "max_retries", + "sourceFileName": "/src/crawlee/_request.py" + }, + "150": { + "qualifiedName": "Request", + "sourceFileName": "/src/crawlee/_request.py" + }, + "151": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_request.py" + }, + "152": { + "qualifiedName": "unique_key", + "sourceFileName": "/src/crawlee/_request.py" + }, + "153": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "154": { + "qualifiedName": "method", + "sourceFileName": "/src/crawlee/_request.py" + }, + "155": { + "qualifiedName": "payload", + "sourceFileName": "/src/crawlee/_request.py" + }, + "156": { + 
"qualifiedName": "retry_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "157": { + "qualifiedName": "no_retry", + "sourceFileName": "/src/crawlee/_request.py" + }, + "158": { + "qualifiedName": "loaded_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "159": { + "qualifiedName": "handled_at", + "sourceFileName": "/src/crawlee/_request.py" + }, + "160": { + "qualifiedName": "from_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "175": { + "qualifiedName": "get_query_param_from_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "179": { + "qualifiedName": "label", + "sourceFileName": "/src/crawlee/_request.py" + }, + "180": { + "qualifiedName": "session_id", + "sourceFileName": "/src/crawlee/_request.py" + }, + "181": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/src/crawlee/_request.py" + }, + "182": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/src/crawlee/_request.py" + }, + "183": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/src/crawlee/_request.py" + }, + "186": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/_request.py" + }, + "187": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/_request.py" + }, + "190": { + "qualifiedName": "max_retries", + "sourceFileName": "/src/crawlee/_request.py" + }, + "191": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "192": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "195": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/src/crawlee/_request.py" + }, + "196": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/src/crawlee/_request.py" + }, + "199": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/src/crawlee/_request.py" + }, + "200": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/src/crawlee/_request.py" + }, + "203": { + "qualifiedName": "forefront", + 
"sourceFileName": "/src/crawlee/_request.py" + }, + "204": { + "qualifiedName": "forefront", + "sourceFileName": "/src/crawlee/_request.py" + }, + "207": { + "qualifiedName": "was_already_handled", + "sourceFileName": "/src/crawlee/_request.py" + }, + "208": { + "qualifiedName": "RequestWithLock", + "sourceFileName": "/src/crawlee/_request.py" + }, + "209": { + "qualifiedName": "lock_expires_at", + "sourceFileName": "/src/crawlee/_request.py" + }, + "210": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "211": { + "qualifiedName": "ServiceLocator", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "212": { + "qualifiedName": "global_storage_instance_manager", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "213": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "218": { + "qualifiedName": "get_configuration", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "220": { + "qualifiedName": "set_configuration", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "223": { + "qualifiedName": "get_event_manager", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "225": { + "qualifiedName": "set_event_manager", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "228": { + "qualifiedName": "get_storage_client", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "230": { + "qualifiedName": "set_storage_client", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "233": { + "qualifiedName": "storage_instance_manager", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "234": { + "qualifiedName": "service_locator", + "sourceFileName": "/src/crawlee/_service_locator.py" + }, + "235": { + "qualifiedName": "Configuration", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "236": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "237": { 
+ "qualifiedName": "internal_timeout", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "238": { + "qualifiedName": "default_browser_path", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "239": { + "qualifiedName": "disable_browser_sandbox", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "240": { + "qualifiedName": "log_level", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "241": { + "qualifiedName": "purge_on_start", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "242": { + "qualifiedName": "persist_state_interval", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "243": { + "qualifiedName": "system_info_interval", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "244": { + "qualifiedName": "max_used_cpu_ratio", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "245": { + "qualifiedName": "max_used_memory_ratio", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "246": { + "qualifiedName": "max_event_loop_delay", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "247": { + "qualifiedName": "max_client_errors", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "248": { + "qualifiedName": "memory_mbytes", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "249": { + "qualifiedName": "available_memory_ratio", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "250": { + "qualifiedName": "storage_dir", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "251": { + "qualifiedName": "headless", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "252": { + "qualifiedName": "get_global_configuration", + "sourceFileName": "/src/crawlee/configuration.py" + }, + "254": { + "qualifiedName": "ProxyInfo", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "255": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "256": { + "qualifiedName": "scheme", + "sourceFileName": 
"/src/crawlee/proxy_configuration.py" + }, + "257": { + "qualifiedName": "hostname", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "258": { + "qualifiedName": "port", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "259": { + "qualifiedName": "username", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "260": { + "qualifiedName": "password", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "261": { + "qualifiedName": "session_id", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "262": { + "qualifiedName": "proxy_tier", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "263": { + "qualifiedName": "ProxyConfiguration", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "264": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "269": { + "qualifiedName": "new_proxy_info", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "274": { + "qualifiedName": "new_url", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "279": { + "qualifiedName": "_ProxyTierTracker", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "280": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "283": { + "qualifiedName": "all_urls", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "284": { + "qualifiedName": "get_tier_urls", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "287": { + "qualifiedName": "add_error", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "291": { + "qualifiedName": "predict_tier", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "294": { + "qualifiedName": "_NewUrlFunction", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "295": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/proxy_configuration.py" + }, + "299": { + 
"qualifiedName": "T", + "sourceFileName": "/src/crawlee/_types.py" + }, + "300": { + "qualifiedName": "HttpMethod", + "sourceFileName": "/src/crawlee/_types.py" + }, + "301": { + "qualifiedName": "HttpPayload", + "sourceFileName": "/src/crawlee/_types.py" + }, + "302": { + "qualifiedName": "RequestTransformAction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "303": { + "qualifiedName": "EnqueueStrategy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "304": { + "qualifiedName": "SkippedReason", + "sourceFileName": "/src/crawlee/_types.py" + }, + "305": { + "qualifiedName": "LogLevel", + "sourceFileName": "/src/crawlee/_types.py" + }, + "306": { + "qualifiedName": "HttpHeaders", + "sourceFileName": "/src/crawlee/_types.py" + }, + "307": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_types.py" + }, + "308": { + "qualifiedName": "__getitem__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "311": { + "qualifiedName": "__setitem__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "315": { + "qualifiedName": "__delitem__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "318": { + "qualifiedName": "__or__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "321": { + "qualifiedName": "__ror__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "324": { + "qualifiedName": "__iter__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "326": { + "qualifiedName": "__len__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "328": { + "qualifiedName": "ConcurrencySettings", + "sourceFileName": "/src/crawlee/_types.py" + }, + "329": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "335": { + "qualifiedName": "EnqueueLinksKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "336": { + "qualifiedName": "limit", + "sourceFileName": "/src/crawlee/_types.py" + }, + "337": { + "qualifiedName": "base_url", + "sourceFileName": "/src/crawlee/_types.py" + }, + "338": { + 
"qualifiedName": "strategy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "339": { + "qualifiedName": "include", + "sourceFileName": "/src/crawlee/_types.py" + }, + "340": { + "qualifiedName": "exclude", + "sourceFileName": "/src/crawlee/_types.py" + }, + "341": { + "qualifiedName": "AddRequestsKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "342": { + "qualifiedName": "requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "343": { + "qualifiedName": "rq_id", + "sourceFileName": "/src/crawlee/_types.py" + }, + "344": { + "qualifiedName": "rq_name", + "sourceFileName": "/src/crawlee/_types.py" + }, + "345": { + "qualifiedName": "rq_alias", + "sourceFileName": "/src/crawlee/_types.py" + }, + "346": { + "qualifiedName": "PushDataKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "347": { + "qualifiedName": "PushDataFunctionCall", + "sourceFileName": "/src/crawlee/_types.py" + }, + "348": { + "qualifiedName": "data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "349": { + "qualifiedName": "dataset_id", + "sourceFileName": "/src/crawlee/_types.py" + }, + "350": { + "qualifiedName": "dataset_name", + "sourceFileName": "/src/crawlee/_types.py" + }, + "351": { + "qualifiedName": "dataset_alias", + "sourceFileName": "/src/crawlee/_types.py" + }, + "352": { + "qualifiedName": "KeyValueStoreInterface", + "sourceFileName": "/src/crawlee/_types.py" + }, + "353": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/_types.py" + }, + "357": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/_types.py" + }, + "370": { + "qualifiedName": "KeyValueStoreValue", + "sourceFileName": "/src/crawlee/_types.py" + }, + "371": { + "qualifiedName": "content", + "sourceFileName": "/src/crawlee/_types.py" + }, + "372": { + "qualifiedName": "content_type", + "sourceFileName": "/src/crawlee/_types.py" + }, + "373": { + "qualifiedName": "KeyValueStoreChangeRecords", + "sourceFileName": "/src/crawlee/_types.py" + }, + 
"374": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "377": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/_types.py" + }, + "382": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/_types.py" + }, + "394": { + "qualifiedName": "RequestHandlerRunResult", + "sourceFileName": "/src/crawlee/_types.py" + }, + "395": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "399": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "400": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "407": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "414": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "419": { + "qualifiedName": "apply_request_changes", + "sourceFileName": "/src/crawlee/_types.py" + }, + "422": { + "qualifiedName": "AddRequestsFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "423": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "430": { + "qualifiedName": "EnqueueLinksFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "431": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "459": { + "qualifiedName": "ExtractLinksFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "460": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "468": { + "qualifiedName": "GetKeyValueStoreFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "469": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "474": { + "qualifiedName": "GetKeyValueStoreFromRequestHandlerFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "475": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "480": { + "qualifiedName": 
"PushDataFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "481": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "488": { + "qualifiedName": "SendRequestFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "489": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "495": { + "qualifiedName": "PageSnapshot", + "sourceFileName": "/src/crawlee/_types.py" + }, + "496": { + "qualifiedName": "screenshot", + "sourceFileName": "/src/crawlee/_types.py" + }, + "497": { + "qualifiedName": "html", + "sourceFileName": "/src/crawlee/_types.py" + }, + "498": { + "qualifiedName": "__bool__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "500": { + "qualifiedName": "UseStateFunction", + "sourceFileName": "/src/crawlee/_types.py" + }, + "501": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "504": { + "qualifiedName": "BasicCrawlingContext", + "sourceFileName": "/src/crawlee/_types.py" + }, + "505": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "506": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "507": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "508": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "509": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "510": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "511": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "512": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "513": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "514": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/_types.py" + }, + "516": { + "qualifiedName": 
"__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "518": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "523": { + "qualifiedName": "GetDataKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "524": { + "qualifiedName": "offset", + "sourceFileName": "/src/crawlee/_types.py" + }, + "525": { + "qualifiedName": "limit", + "sourceFileName": "/src/crawlee/_types.py" + }, + "526": { + "qualifiedName": "clean", + "sourceFileName": "/src/crawlee/_types.py" + }, + "527": { + "qualifiedName": "desc", + "sourceFileName": "/src/crawlee/_types.py" + }, + "528": { + "qualifiedName": "fields", + "sourceFileName": "/src/crawlee/_types.py" + }, + "529": { + "qualifiedName": "omit", + "sourceFileName": "/src/crawlee/_types.py" + }, + "530": { + "qualifiedName": "unwind", + "sourceFileName": "/src/crawlee/_types.py" + }, + "531": { + "qualifiedName": "skip_empty", + "sourceFileName": "/src/crawlee/_types.py" + }, + "532": { + "qualifiedName": "skip_hidden", + "sourceFileName": "/src/crawlee/_types.py" + }, + "533": { + "qualifiedName": "flatten", + "sourceFileName": "/src/crawlee/_types.py" + }, + "534": { + "qualifiedName": "view", + "sourceFileName": "/src/crawlee/_types.py" + }, + "535": { + "qualifiedName": "ExportToKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "536": { + "qualifiedName": "key", + "sourceFileName": "/src/crawlee/_types.py" + }, + "537": { + "qualifiedName": "content_type", + "sourceFileName": "/src/crawlee/_types.py" + }, + "538": { + "qualifiedName": "to_kvs_id", + "sourceFileName": "/src/crawlee/_types.py" + }, + "539": { + "qualifiedName": "to_kvs_name", + "sourceFileName": "/src/crawlee/_types.py" + }, + "540": { + "qualifiedName": "to_kvs_storage_client", + "sourceFileName": "/src/crawlee/_types.py" + }, + "541": { + "qualifiedName": "to_kvs_configuration", + "sourceFileName": "/src/crawlee/_types.py" + }, + "542": { + "qualifiedName": "ExportDataJsonKwargs", + 
"sourceFileName": "/src/crawlee/_types.py" + }, + "543": { + "qualifiedName": "skipkeys", + "sourceFileName": "/src/crawlee/_types.py" + }, + "544": { + "qualifiedName": "ensure_ascii", + "sourceFileName": "/src/crawlee/_types.py" + }, + "545": { + "qualifiedName": "check_circular", + "sourceFileName": "/src/crawlee/_types.py" + }, + "546": { + "qualifiedName": "allow_nan", + "sourceFileName": "/src/crawlee/_types.py" + }, + "547": { + "qualifiedName": "cls", + "sourceFileName": "/src/crawlee/_types.py" + }, + "548": { + "qualifiedName": "indent", + "sourceFileName": "/src/crawlee/_types.py" + }, + "549": { + "qualifiedName": "separators", + "sourceFileName": "/src/crawlee/_types.py" + }, + "550": { + "qualifiedName": "default", + "sourceFileName": "/src/crawlee/_types.py" + }, + "551": { + "qualifiedName": "sort_keys", + "sourceFileName": "/src/crawlee/_types.py" + }, + "552": { + "qualifiedName": "ExportDataCsvKwargs", + "sourceFileName": "/src/crawlee/_types.py" + }, + "553": { + "qualifiedName": "dialect", + "sourceFileName": "/src/crawlee/_types.py" + }, + "554": { + "qualifiedName": "delimiter", + "sourceFileName": "/src/crawlee/_types.py" + }, + "555": { + "qualifiedName": "doublequote", + "sourceFileName": "/src/crawlee/_types.py" + }, + "556": { + "qualifiedName": "escapechar", + "sourceFileName": "/src/crawlee/_types.py" + }, + "557": { + "qualifiedName": "lineterminator", + "sourceFileName": "/src/crawlee/_types.py" + }, + "558": { + "qualifiedName": "quotechar", + "sourceFileName": "/src/crawlee/_types.py" + }, + "559": { + "qualifiedName": "quoting", + "sourceFileName": "/src/crawlee/_types.py" + }, + "560": { + "qualifiedName": "skipinitialspace", + "sourceFileName": "/src/crawlee/_types.py" + }, + "561": { + "qualifiedName": "strict", + "sourceFileName": "/src/crawlee/_types.py" + }, + "562": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "563": { + "qualifiedName": "AbortError", + 
"sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "564": { + "qualifiedName": "_AutoscaledPoolRun", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "565": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "567": { + "qualifiedName": "AutoscaledPool", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "568": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "575": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "577": { + "qualifiedName": "abort", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "579": { + "qualifiedName": "pause", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "581": { + "qualifiedName": "resume", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "583": { + "qualifiedName": "desired_concurrency", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "584": { + "qualifiedName": "current_concurrency", + "sourceFileName": "/src/crawlee/_autoscaling/autoscaled_pool.py" + }, + "585": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_autoscaling/system_status.py" + }, + "586": { + "qualifiedName": "SystemStatus", + "sourceFileName": "/src/crawlee/_autoscaling/system_status.py" + }, + "587": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_autoscaling/system_status.py" + }, + "595": { + "qualifiedName": "get_current_system_info", + "sourceFileName": "/src/crawlee/_autoscaling/system_status.py" + }, + "597": { + "qualifiedName": "get_historical_system_info", + "sourceFileName": "/src/crawlee/_autoscaling/system_status.py" + }, + "599": { + "qualifiedName": "SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "600": { + "qualifiedName": 
"LoadRatioInfo", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "601": { + "qualifiedName": "limit_ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "602": { + "qualifiedName": "actual_ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "603": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "604": { + "qualifiedName": "SystemInfo", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "605": { + "qualifiedName": "cpu_info", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "606": { + "qualifiedName": "memory_info", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "607": { + "qualifiedName": "event_loop_info", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "608": { + "qualifiedName": "client_info", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "609": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "610": { + "qualifiedName": "is_system_idle", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "611": { + "qualifiedName": "__str__", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "613": { + "qualifiedName": "CpuSnapshot", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "614": { + "qualifiedName": "used_ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "615": { + "qualifiedName": "max_used_ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "616": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "617": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "618": { + "qualifiedName": "MemorySnapshot", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "619": { + "qualifiedName": "current_size", + "sourceFileName": 
"/src/crawlee/_autoscaling/_types.py" + }, + "620": { + "qualifiedName": "system_wide_used_size", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "621": { + "qualifiedName": "max_memory_size", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "622": { + "qualifiedName": "system_wide_memory_size", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "623": { + "qualifiedName": "max_used_memory_ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "624": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "625": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "626": { + "qualifiedName": "EventLoopSnapshot", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "627": { + "qualifiedName": "delay", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "628": { + "qualifiedName": "max_delay", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "629": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "630": { + "qualifiedName": "max_delay_exceeded", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "631": { + "qualifiedName": "is_overloaded", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "632": { + "qualifiedName": "ClientSnapshot", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "633": { + "qualifiedName": "error_count", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "634": { + "qualifiedName": "new_error_count", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "635": { + "qualifiedName": "max_error_count", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "636": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "637": { + "qualifiedName": "is_overloaded", + "sourceFileName": 
"/src/crawlee/_autoscaling/_types.py" + }, + "638": { + "qualifiedName": "Snapshot", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "639": { + "qualifiedName": "Ratio", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "640": { + "qualifiedName": "value", + "sourceFileName": "/src/crawlee/_autoscaling/_types.py" + }, + "641": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "642": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "643": { + "qualifiedName": "SortedSnapshotList", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "644": { + "qualifiedName": "add", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "647": { + "qualifiedName": "Snapshotter", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "648": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "655": { + "qualifiedName": "from_config", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "658": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "659": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "661": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "666": { + "qualifiedName": "get_memory_sample", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "669": { + "qualifiedName": "get_event_loop_sample", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "672": { + "qualifiedName": "get_cpu_sample", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "675": { + "qualifiedName": "get_client_sample", + "sourceFileName": "/src/crawlee/_autoscaling/snapshotter.py" + }, + "678": { + "qualifiedName": "CLOUDFLARE_RETRY_CSS_SELECTORS", + 
"sourceFileName": "/src/crawlee/_utils/blocked.py" + }, + "679": { + "qualifiedName": "RETRY_CSS_SELECTORS", + "sourceFileName": "/src/crawlee/_utils/blocked.py" + }, + "680": { + "qualifiedName": "ROTATE_PROXY_ERRORS", + "sourceFileName": "/src/crawlee/_utils/blocked.py" + }, + "681": { + "qualifiedName": "ByteSize", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "682": { + "qualifiedName": "bytes", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "683": { + "qualifiedName": "__post_init__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "685": { + "qualifiedName": "validate", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "688": { + "qualifiedName": "from_kb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "691": { + "qualifiedName": "from_mb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "694": { + "qualifiedName": "from_gb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "697": { + "qualifiedName": "from_tb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "700": { + "qualifiedName": "to_kb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "702": { + "qualifiedName": "to_mb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "704": { + "qualifiedName": "to_gb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "706": { + "qualifiedName": "to_tb", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "708": { + "qualifiedName": "__str__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "710": { + "qualifiedName": "__eq__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "713": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "715": { + "qualifiedName": "__lt__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "718": { + "qualifiedName": "__le__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "721": 
{ + "qualifiedName": "__gt__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "724": { + "qualifiedName": "__ge__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "727": { + "qualifiedName": "__add__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "730": { + "qualifiedName": "__sub__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "733": { + "qualifiedName": "__mul__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "736": { + "qualifiedName": "__truediv__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "739": { + "qualifiedName": "__rmul__", + "sourceFileName": "/src/crawlee/_utils/byte_size.py" + }, + "742": { + "qualifiedName": "BORDER", + "sourceFileName": "/src/crawlee/_utils/console.py" + }, + "743": { + "qualifiedName": "make_table", + "sourceFileName": "/src/crawlee/_utils/console.py" + }, + "747": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/_utils/context.py" + }, + "748": { + "qualifiedName": "ensure_context", + "sourceFileName": "/src/crawlee/_utils/context.py" + }, + "751": { + "qualifiedName": "compute_short_hash", + "sourceFileName": "/src/crawlee/_utils/crypto.py" + }, + "755": { + "qualifiedName": "crypto_random_object_id", + "sourceFileName": "/src/crawlee/_utils/crypto.py" + }, + "758": { + "qualifiedName": "GroupName", + "sourceFileName": "/src/crawlee/_utils/docs.py" + }, + "759": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/_utils/docs.py" + }, + "760": { + "qualifiedName": "docs_group", + "sourceFileName": "/src/crawlee/_utils/docs.py" + }, + "763": { + "qualifiedName": "infer_mime_type", + "sourceFileName": "/src/crawlee/_utils/file.py" + }, + "766": { + "qualifiedName": "json_dumps", + "sourceFileName": "/src/crawlee/_utils/file.py" + }, + "769": { + "qualifiedName": "atomic_write", + "sourceFileName": "/src/crawlee/_utils/file.py" + }, + "774": { + "qualifiedName": "export_json_to_stream", + "sourceFileName": 
"/src/crawlee/_utils/file.py" + }, + "779": { + "qualifiedName": "export_csv_to_stream", + "sourceFileName": "/src/crawlee/_utils/file.py" + }, + "784": { + "qualifiedName": "Glob", + "sourceFileName": "/src/crawlee/_utils/globs.py" + }, + "785": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/globs.py" + }, + "788": { + "qualifiedName": "SKIP_TAGS", + "sourceFileName": "/src/crawlee/_utils/html_to_text.py" + }, + "789": { + "qualifiedName": "BLOCK_TAGS", + "sourceFileName": "/src/crawlee/_utils/html_to_text.py" + }, + "790": { + "qualifiedName": "timedelta_ms", + "sourceFileName": "/src/crawlee/_utils/models.py" + }, + "791": { + "qualifiedName": "timedelta_secs", + "sourceFileName": "/src/crawlee/_utils/models.py" + }, + "792": { + "qualifiedName": "raise_if_too_many_kwargs", + "sourceFileName": "/src/crawlee/_utils/raise_if_too_many_kwargs.py" + }, + "796": { + "qualifiedName": "TStateModel", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "797": { + "qualifiedName": "RecoverableState", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "798": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "807": { + "qualifiedName": "initialize", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "809": { + "qualifiedName": "teardown", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "811": { + "qualifiedName": "current_value", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "812": { + "qualifiedName": "is_initialized", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "813": { + "qualifiedName": "has_persisted_state", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "815": { + "qualifiedName": "reset", + "sourceFileName": "/src/crawlee/_utils/recoverable_state.py" + }, + "817": { + "qualifiedName": "persist_state", + "sourceFileName": 
"/src/crawlee/_utils/recoverable_state.py" + }, + "820": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "821": { + "qualifiedName": "RecurringTask", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "822": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "826": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "828": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "833": { + "qualifiedName": "start", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "835": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/_utils/recurring_task.py" + }, + "837": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_utils/requests.py" + }, + "838": { + "qualifiedName": "normalize_url", + "sourceFileName": "/src/crawlee/_utils/requests.py" + }, + "842": { + "qualifiedName": "compute_unique_key", + "sourceFileName": "/src/crawlee/_utils/requests.py" + }, + "851": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "852": { + "qualifiedName": "RobotsTxtFile", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "853": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "859": { + "qualifiedName": "from_content", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "863": { + "qualifiedName": "find", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "868": { + "qualifiedName": "load", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "873": { + "qualifiedName": "is_allowed", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "877": { + "qualifiedName": "get_sitemaps", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "879": { + "qualifiedName": "get_crawl_delay", + "sourceFileName": 
"/src/crawlee/_utils/robots.py" + }, + "882": { + "qualifiedName": "parse_sitemaps", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "884": { + "qualifiedName": "parse_urls_from_sitemaps", + "sourceFileName": "/src/crawlee/_utils/robots.py" + }, + "886": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "887": { + "qualifiedName": "CpuInfo", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "888": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "889": { + "qualifiedName": "used_ratio", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "890": { + "qualifiedName": "MemoryUsageInfo", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "891": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "892": { + "qualifiedName": "current_size", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "893": { + "qualifiedName": "MemoryInfo", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "894": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "895": { + "qualifiedName": "total_size", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "896": { + "qualifiedName": "system_wide_used_size", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "897": { + "qualifiedName": "get_cpu_info", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "899": { + "qualifiedName": "get_memory_info", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "901": { + "qualifiedName": "TimerResult", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "902": { + "qualifiedName": "wall", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "903": { + "qualifiedName": "cpu", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "904": { + "qualifiedName": "measure_time", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "906": { + 
"qualifiedName": "SharedTimeout", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "907": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "910": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "912": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "917": { + "qualifiedName": "format_duration", + "sourceFileName": "/src/crawlee/_utils/time.py" + }, + "920": { + "qualifiedName": "try_import", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "924": { + "qualifiedName": "install_import_hook", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "927": { + "qualifiedName": "FailedImport", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "928": { + "qualifiedName": "message", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "929": { + "qualifiedName": "ImportWrapper", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "930": { + "qualifiedName": "__getattribute__", + "sourceFileName": "/src/crawlee/_utils/try_import.py" + }, + "933": { + "qualifiedName": "is_url_absolute", + "sourceFileName": "/src/crawlee/_utils/urls.py" + }, + "936": { + "qualifiedName": "convert_to_absolute_url", + "sourceFileName": "/src/crawlee/_utils/urls.py" + }, + "940": { + "qualifiedName": "to_absolute_url_iterator", + "sourceFileName": "/src/crawlee/_utils/urls.py" + }, + "945": { + "qualifiedName": "validate_http_url", + "sourceFileName": "/src/crawlee/_utils/urls.py" + }, + "948": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/_utils/wait.py" + }, + "949": { + "qualifiedName": "wait_for", + "sourceFileName": "/src/crawlee/_utils/wait.py" + }, + "956": { + "qualifiedName": "wait_for_all_tasks_for_finish", + "sourceFileName": "/src/crawlee/_utils/wait.py" + }, + "961": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "962": { + 
"qualifiedName": "VALID_CHANGE_FREQS", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "963": { + "qualifiedName": "SITEMAP_HEADERS", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "964": { + "qualifiedName": "SITEMAP_URL_PATTERN", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "965": { + "qualifiedName": "COMMON_SITEMAP_PATHS", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "966": { + "qualifiedName": "SitemapUrl", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "967": { + "qualifiedName": "loc", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "968": { + "qualifiedName": "lastmod", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "969": { + "qualifiedName": "changefreq", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "970": { + "qualifiedName": "priority", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "971": { + "qualifiedName": "origin_sitemap_url", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "972": { + "qualifiedName": "NestedSitemap", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "973": { + "qualifiedName": "loc", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "974": { + "qualifiedName": "origin_sitemap_url", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "975": { + "qualifiedName": "ParseSitemapOptions", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "976": { + "qualifiedName": "emit_nested_sitemaps", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "977": { + "qualifiedName": "max_depth", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "978": { + "qualifiedName": "sitemap_retries", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "979": { + "qualifiedName": "timeout", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "980": { + "qualifiedName": "SitemapSource", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "981": { + 
"qualifiedName": "type", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "982": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "983": { + "qualifiedName": "content", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "984": { + "qualifiedName": "depth", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "985": { + "qualifiedName": "_SitemapItem", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "986": { + "qualifiedName": "type", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "987": { + "qualifiedName": "loc", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "988": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "989": { + "qualifiedName": "lastmod", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "990": { + "qualifiedName": "changefreq", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "991": { + "qualifiedName": "priority", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "992": { + "qualifiedName": "_XMLSaxSitemapHandler", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "993": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "995": { + "qualifiedName": "items", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "996": { + "qualifiedName": "startElement", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1000": { + "qualifiedName": "characters", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1003": { + "qualifiedName": "endElement", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1006": { + "qualifiedName": "_TxtSitemapParser", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1007": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1009": { + "qualifiedName": "process_chunk", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + 
}, + "1012": { + "qualifiedName": "flush", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1014": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1016": { + "qualifiedName": "_XmlSitemapParser", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1017": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1019": { + "qualifiedName": "process_chunk", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1022": { + "qualifiedName": "flush", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1024": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1026": { + "qualifiedName": "Sitemap", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1027": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1030": { + "qualifiedName": "urls", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1031": { + "qualifiedName": "try_common_names", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1036": { + "qualifiedName": "load", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1042": { + "qualifiedName": "from_xml_string", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1045": { + "qualifiedName": "parse", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1051": { + "qualifiedName": "parse_sitemap", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1057": { + "qualifiedName": "discover_valid_sitemaps", + "sourceFileName": "/src/crawlee/_utils/sitemap.py" + }, + "1064": { + "qualifiedName": "is_status_code_client_error", + "sourceFileName": "/src/crawlee/_utils/web.py" + }, + "1067": { + "qualifiedName": "is_status_code_server_error", + "sourceFileName": "/src/crawlee/_utils/web.py" + }, + "1070": { + "qualifiedName": "is_status_code_successful", + "sourceFileName": "/src/crawlee/_utils/web.py" + }, + "1073": 
{ + "qualifiedName": "BrowserController", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1074": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1075": { + "qualifiedName": "pages", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1076": { + "qualifiedName": "total_opened_pages", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1077": { + "qualifiedName": "pages_count", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1078": { + "qualifiedName": "last_page_opened_at", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1079": { + "qualifiedName": "idle_time", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1080": { + "qualifiedName": "has_free_capacity", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1081": { + "qualifiedName": "is_browser_connected", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1082": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1083": { + "qualifiedName": "new_page", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1087": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/browsers/_browser_controller.py" + }, + "1090": { + "qualifiedName": "BrowserPlugin", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1091": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1092": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1093": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1094": { + "qualifiedName": "browser_launch_options", + "sourceFileName": 
"/src/crawlee/browsers/_browser_plugin.py" + }, + "1095": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1096": { + "qualifiedName": "max_open_pages_per_browser", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1097": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1099": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1104": { + "qualifiedName": "new_browser", + "sourceFileName": "/src/crawlee/browsers/_browser_plugin.py" + }, + "1106": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1107": { + "qualifiedName": "PlaywrightPersistentBrowser", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1108": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1113": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1114": { + "qualifiedName": "contexts", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1115": { + "qualifiedName": "is_connected", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1117": { + "qualifiedName": "new_context", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1120": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1123": { + "qualifiedName": "version", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1124": { + "qualifiedName": "new_page", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1127": { + "qualifiedName": "new_browser_cdp_session", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1129": { + "qualifiedName": "start_tracing", + 
"sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1132": { + "qualifiedName": "stop_tracing", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser.py" + }, + "1135": { + "qualifiedName": "BrowserType", + "sourceFileName": "/src/crawlee/browsers/_types.py" + }, + "1136": { + "qualifiedName": "CrawleePage", + "sourceFileName": "/src/crawlee/browsers/_types.py" + }, + "1137": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/browsers/_types.py" + }, + "1138": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_types.py" + }, + "1139": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/browsers/_types.py" + }, + "1140": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1141": { + "qualifiedName": "PlaywrightBrowserPlugin", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1142": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1143": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1152": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1153": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1154": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1155": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1156": { + "qualifiedName": "max_open_pages_per_browser", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1157": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + 
"1159": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1164": { + "qualifiedName": "new_browser", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_plugin.py" + }, + "1166": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1167": { + "qualifiedName": "BrowserPool", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1168": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1176": { + "qualifiedName": "with_default_plugin", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1186": { + "qualifiedName": "plugins", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1187": { + "qualifiedName": "active_browsers", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1188": { + "qualifiedName": "inactive_browsers", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1189": { + "qualifiedName": "pages", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1190": { + "qualifiedName": "total_pages_count", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1191": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1192": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1194": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1199": { + "qualifiedName": "new_page", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1204": { + "qualifiedName": "new_page_with_each_plugin", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1206": { + "qualifiedName": "pre_page_create_hook", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1209": { + "qualifiedName": "post_page_create_hook", 
+ "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1212": { + "qualifiedName": "pre_page_close_hook", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1215": { + "qualifiedName": "post_page_close_hook", + "sourceFileName": "/src/crawlee/browsers/_browser_pool.py" + }, + "1218": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1219": { + "qualifiedName": "PlaywrightBrowserController", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1220": { + "qualifiedName": "AUTOMATION_LIBRARY", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1221": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1228": { + "qualifiedName": "pages", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1229": { + "qualifiedName": "total_opened_pages", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1230": { + "qualifiedName": "pages_count", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1231": { + "qualifiedName": "last_page_opened_at", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1232": { + "qualifiedName": "idle_time", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1233": { + "qualifiedName": "has_free_capacity", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1234": { + "qualifiedName": "is_browser_connected", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1235": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1236": { + "qualifiedName": "new_page", + "sourceFileName": 
"/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1240": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/browsers/_playwright_browser_controller.py" + }, + "1243": { + "qualifiedName": "TParseResult", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1244": { + "qualifiedName": "TSelectResult", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1245": { + "qualifiedName": "HttpCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1246": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1250": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1252": { + "qualifiedName": "ParsedHttpCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1253": { + "qualifiedName": "parsed_content", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1254": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1255": { + "qualifiedName": "extract_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1256": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "1262": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1263": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1264": { + "qualifiedName": "HttpCrawlerOptions", + "sourceFileName": 
"/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1265": { + "qualifiedName": "navigation_timeout", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1266": { + "qualifiedName": "AbstractHttpCrawler", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1267": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1272": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1275": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1278": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "1281": { + "qualifiedName": "AbstractHttpParser", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1282": { + "qualifiedName": "parse", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1285": { + "qualifiedName": "parse_text", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1288": { + "qualifiedName": "select", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1292": { + "qualifiedName": "is_blocked", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1295": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1299": { + "qualifiedName": "find_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "1304": { + "qualifiedName": "AdaptivePlaywrightCrawlerStatisticState", + "sourceFileName": 
"/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "1305": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "1306": { + "qualifiedName": "http_only_request_handler_runs", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "1307": { + "qualifiedName": "browser_request_handler_runs", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "1308": { + "qualifiedName": "rendering_type_mispredictions", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py" + }, + "1309": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1310": { + "qualifiedName": "UrlComponents", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1311": { + "qualifiedName": "RenderingType", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1312": { + "qualifiedName": "FeatureVector", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1313": { + "qualifiedName": "RenderingTypePredictorState", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1314": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1315": { + "qualifiedName": "model", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1316": { + "qualifiedName": "labels_coefficients", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1317": { + "qualifiedName": 
"RenderingTypePrediction", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1318": { + "qualifiedName": "rendering_type", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1319": { + "qualifiedName": "detection_probability_recommendation", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1320": { + "qualifiedName": "RenderingTypePredictor", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1321": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1323": { + "qualifiedName": "predict", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1326": { + "qualifiedName": "store_result", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1330": { + "qualifiedName": "initialize", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1332": { + "qualifiedName": "clear", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1334": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1336": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1341": { + "qualifiedName": "DefaultRenderingTypePredictor", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1342": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1347": { + "qualifiedName": "initialize", + "sourceFileName": 
"/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1349": { + "qualifiedName": "clear", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1351": { + "qualifiedName": "predict", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1354": { + "qualifiedName": "store_result", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1358": { + "qualifiedName": "get_url_components", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1361": { + "qualifiedName": "calculate_url_similarity", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "1365": { + "qualifiedName": "create_default_comparator", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "1368": { + "qualifiedName": "full_result_comparator", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "1372": { + "qualifiedName": "push_data_only_comparator", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py" + }, + "1376": { + "qualifiedName": "sklearn_model_validator", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_utils.py" + }, + "1379": { + "qualifiedName": "sklearn_model_serializer", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_utils.py" + }, + "1382": { + "qualifiedName": "TStaticParseResult", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1383": { + "qualifiedName": "TStaticSelectResult", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1384": { + "qualifiedName": "TStaticCrawlingContext", + "sourceFileName": 
"/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1385": { + "qualifiedName": "_NonPersistentStatistics", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1386": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1388": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1390": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1395": { + "qualifiedName": "AdaptivePlaywrightCrawler", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1396": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1405": { + "qualifiedName": "with_beautifulsoup_static_parser", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1414": { + "qualifiedName": "with_parsel_static_parser", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1422": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1426": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1430": { + "qualifiedName": "track_http_only_request_handler_runs", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1432": { + "qualifiedName": "track_browser_request_handler_runs", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1434": { + 
"qualifiedName": "track_rendering_type_mispredictions", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1436": { + "qualifiedName": "SubCrawlerRun", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1437": { + "qualifiedName": "result", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1438": { + "qualifiedName": "exception", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py" + }, + "1439": { + "qualifiedName": "TStaticParseResult", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1440": { + "qualifiedName": "TStaticSelectResult", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1441": { + "qualifiedName": "AdaptiveContextError", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1442": { + "qualifiedName": "AdaptivePlaywrightCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1443": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1444": { + "qualifiedName": "infinite_scroll", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1445": { + "qualifiedName": "response", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1446": { + "qualifiedName": "wait_for_selector", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1450": { + "qualifiedName": "query_selector_one", + "sourceFileName": 
"/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1454": { + "qualifiedName": "query_selector_all", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1458": { + "qualifiedName": "parse_with_static_parser", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1462": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1466": { + "qualifiedName": "from_playwright_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1470": { + "qualifiedName": "AdaptivePlaywrightPreNavCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1471": { + "qualifiedName": "block_requests", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1472": { + "qualifiedName": "goto_options", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1473": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1474": { + "qualifiedName": "from_pre_navigation_context", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1477": { + "qualifiedName": "AdaptivePlaywrightPostNavCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1478": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1479": { + "qualifiedName": "response", + 
"sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1480": { + "qualifiedName": "from_post_navigation_context", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py" + }, + "1483": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1484": { + "qualifiedName": "TMiddlewareCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1485": { + "qualifiedName": "_Middleware", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1486": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1490": { + "qualifiedName": "action", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1492": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1495": { + "qualifiedName": "ContextPipeline", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1496": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1500": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1504": { + "qualifiedName": "compose", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_pipeline.py" + }, + "1507": { + "qualifiedName": "swapped_context", + "sourceFileName": "/src/crawlee/crawlers/_basic/_context_utils.py" + }, + "1511": { + "qualifiedName": "reduce_asyncio_timeout_error_to_relevant_traceback_parts", + "sourceFileName": "/src/crawlee/crawlers/_basic/_logging_utils.py" + }, + "1514": { + "qualifiedName": "get_one_line_error_summary_if_possible", + "sourceFileName": "/src/crawlee/crawlers/_basic/_logging_utils.py" + }, + "1517": { + "qualifiedName": 
"TCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1518": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1519": { + "qualifiedName": "TRequestIterator", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1520": { + "qualifiedName": "TParams", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1521": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1522": { + "qualifiedName": "ErrorHandler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1523": { + "qualifiedName": "FailedRequestHandler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1524": { + "qualifiedName": "SkippedRequestCallback", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1525": { + "qualifiedName": "_BasicCrawlerOptions", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1526": { + "qualifiedName": "configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1527": { + "qualifiedName": "event_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1528": { + "qualifiedName": "storage_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1529": { + "qualifiedName": "request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1530": { + "qualifiedName": "session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1531": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1532": { + "qualifiedName": "http_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1533": { + "qualifiedName": "max_request_retries", + 
"sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1534": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1535": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1536": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1537": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1538": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1539": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1540": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1541": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1542": { + "qualifiedName": "configure_logging", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1543": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1544": { + "qualifiedName": "keep_alive", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1545": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1546": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1547": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1548": { + "qualifiedName": "status_message_logging_interval", + "sourceFileName": 
"/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1549": { + "qualifiedName": "status_message_callback", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1550": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1551": { + "qualifiedName": "_BasicCrawlerOptionsGeneric", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1552": { + "qualifiedName": "request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1553": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1554": { + "qualifiedName": "BasicCrawlerOptions", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1555": { + "qualifiedName": "BasicCrawler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1556": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1588": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1589": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1590": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1593": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1594": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1597": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1599": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1604": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + 
"1609": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1612": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1615": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1618": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1622": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1630": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1633": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1639": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "1646": { + "qualifiedName": "BeautifulSoupCrawler", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py" + }, + "1647": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py" + }, + "1651": { + "qualifiedName": "BeautifulSoupCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "1652": { + "qualifiedName": "soup", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "1653": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "1656": { + "qualifiedName": "html_to_text", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py" + }, + "1658": { + "qualifiedName": "html_to_text", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_utils.py" + }, + "1661": { + 
"qualifiedName": "BeautifulSoupParser", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1662": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1665": { + "qualifiedName": "parse", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1668": { + "qualifiedName": "parse_text", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1671": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1675": { + "qualifiedName": "select", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1679": { + "qualifiedName": "find_links", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1684": { + "qualifiedName": "BeautifulSoupParserType", + "sourceFileName": "/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py" + }, + "1685": { + "qualifiedName": "HttpCrawler", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_crawler.py" + }, + "1686": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_crawler.py" + }, + "1689": { + "qualifiedName": "NoParser", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1690": { + "qualifiedName": "parse", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1693": { + "qualifiedName": "parse_text", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1696": { + "qualifiedName": "select", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1700": { + "qualifiedName": "is_blocked", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1703": { + "qualifiedName": "is_matching_selector", + "sourceFileName": 
"/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1707": { + "qualifiedName": "find_links", + "sourceFileName": "/src/crawlee/crawlers/_http/_http_parser.py" + }, + "1712": { + "qualifiedName": "ParselCrawler", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawler.py" + }, + "1713": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawler.py" + }, + "1716": { + "qualifiedName": "ParselCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "1717": { + "qualifiedName": "selector", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "1718": { + "qualifiedName": "from_parsed_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "1721": { + "qualifiedName": "html_to_text", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py" + }, + "1723": { + "qualifiedName": "html_to_text", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_utils.py" + }, + "1726": { + "qualifiedName": "ParselParser", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1727": { + "qualifiedName": "parse", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1730": { + "qualifiedName": "parse_text", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1733": { + "qualifiedName": "select", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1737": { + "qualifiedName": "is_matching_selector", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1741": { + "qualifiedName": "find_links", + "sourceFileName": "/src/crawlee/crawlers/_parsel/_parsel_parser.py" + }, + "1746": { + "qualifiedName": "browser_page_context", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1749": { + "qualifiedName": "PlaywrightHttpClient", 
+ "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1750": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1752": { + "qualifiedName": "crawl", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1759": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1768": { + "qualifiedName": "stream", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1777": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_http_client.py" + }, + "1779": { + "qualifiedName": "PlaywrightPreNavCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "1780": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "1781": { + "qualifiedName": "block_requests", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "1782": { + "qualifiedName": "goto_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "1783": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "1785": { + "qualifiedName": "infinite_scroll", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_utils.py" + }, + "1788": { + "qualifiedName": "block_requests", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_utils.py" + }, + "1793": { + "qualifiedName": "BlockRequestsFunction", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1794": { + "qualifiedName": "__call__", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1798": { + 
"qualifiedName": "PlaywrightHttpResponse", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1799": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1800": { + "qualifiedName": "status_code", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1801": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1802": { + "qualifiedName": "read", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1804": { + "qualifiedName": "read_stream", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1806": { + "qualifiedName": "from_playwright_response", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1810": { + "qualifiedName": "GotoOptions", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1811": { + "qualifiedName": "wait_until", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1812": { + "qualifiedName": "referer", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_types.py" + }, + "1813": { + "qualifiedName": "TCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1814": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1815": { + "qualifiedName": "PlaywrightCrawler", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1816": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1829": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1832": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1835": { + 
"qualifiedName": "_PlaywrightCrawlerAdditionalOptions", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1836": { + "qualifiedName": "browser_pool", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1837": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1838": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1839": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1840": { + "qualifiedName": "headless", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1841": { + "qualifiedName": "PlaywrightCrawlerOptions", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "1842": { + "qualifiedName": "PlaywrightCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "1843": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "1844": { + "qualifiedName": "extract_links", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "1845": { + "qualifiedName": "infinite_scroll", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py" + }, + "1846": { + "qualifiedName": "PlaywrightPostNavCrawlingContext", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py" + }, + "1847": { + "qualifiedName": "response", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py" + }, + "1848": { + "qualifiedName": "BlockedInfo", + "sourceFileName": "/src/crawlee/crawlers/_types.py" + }, + "1849": { + "qualifiedName": "reason", + 
"sourceFileName": "/src/crawlee/crawlers/_types.py" + }, + "1850": { + "qualifiedName": "__bool__", + "sourceFileName": "/src/crawlee/crawlers/_types.py" + }, + "1852": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1853": { + "qualifiedName": "LocalEventManager", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1854": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1858": { + "qualifiedName": "from_config", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1861": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1863": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/events/_local_event_manager.py" + }, + "1868": { + "qualifiedName": "Event", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1869": { + "qualifiedName": "PERSIST_STATE", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1870": { + "qualifiedName": "SYSTEM_INFO", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1871": { + "qualifiedName": "MIGRATING", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1872": { + "qualifiedName": "ABORTING", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1873": { + "qualifiedName": "EXIT", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1874": { + "qualifiedName": "SESSION_RETIRED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1875": { + "qualifiedName": "BROWSER_LAUNCHED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1876": { + "qualifiedName": "BROWSER_RETIRED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1877": { + "qualifiedName": "BROWSER_CLOSED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1878": { + "qualifiedName": "PAGE_CREATED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1879": 
{ + "qualifiedName": "PAGE_CLOSED", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1880": { + "qualifiedName": "CRAWLER_STATUS", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1881": { + "qualifiedName": "EventPersistStateData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1882": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1883": { + "qualifiedName": "is_migrating", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1884": { + "qualifiedName": "EventSystemInfoData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1885": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1886": { + "qualifiedName": "cpu_info", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1887": { + "qualifiedName": "memory_info", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1888": { + "qualifiedName": "EventMigratingData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1889": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1890": { + "qualifiedName": "time_remaining", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1891": { + "qualifiedName": "EventAbortingData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1892": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1893": { + "qualifiedName": "EventExitData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1894": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1895": { + "qualifiedName": "EventCrawlerStatusData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1896": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1897": { + "qualifiedName": "message", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + 
"1898": { + "qualifiedName": "crawler_id", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1899": { + "qualifiedName": "EventData", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1900": { + "qualifiedName": "WrappedListener", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1901": { + "qualifiedName": "TEvent", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1902": { + "qualifiedName": "EventListener", + "sourceFileName": "/src/crawlee/events/_types.py" + }, + "1903": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1904": { + "qualifiedName": "EventManagerOptions", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1905": { + "qualifiedName": "persist_state_interval", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1906": { + "qualifiedName": "close_timeout", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1907": { + "qualifiedName": "EventManager", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1908": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1912": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1913": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1915": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1920": { + "qualifiedName": "on", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1924": { + "qualifiedName": "off", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1928": { + "qualifiedName": "emit", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1932": { + "qualifiedName": "wait_for_all_listeners_to_complete", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "1977": { + "qualifiedName": 
"COMMON_ACCEPT_LANGUAGE", + "sourceFileName": "/src/crawlee/fingerprint_suite/_consts.py" + }, + "1978": { + "qualifiedName": "BROWSER_TYPE_HEADER_KEYWORD", + "sourceFileName": "/src/crawlee/fingerprint_suite/_consts.py" + }, + "1979": { + "qualifiedName": "FingerprintGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_fingerprint_generator.py" + }, + "1980": { + "qualifiedName": "generate", + "sourceFileName": "/src/crawlee/fingerprint_suite/_fingerprint_generator.py" + }, + "1982": { + "qualifiedName": "fingerprint_browser_type_from_playwright_browser_type", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1985": { + "qualifiedName": "HeaderGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1986": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1988": { + "qualifiedName": "get_specific_headers", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1992": { + "qualifiedName": "get_common_headers", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1994": { + "qualifiedName": "get_random_user_agent_header", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1996": { + "qualifiedName": "get_user_agent_header", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "1999": { + "qualifiedName": "get_sec_ch_ua_headers", + "sourceFileName": "/src/crawlee/fingerprint_suite/_header_generator.py" + }, + "2002": { + "qualifiedName": "SupportedOperatingSystems", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2003": { + "qualifiedName": "SupportedDevices", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2004": { + "qualifiedName": "SupportedHttpVersion", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2005": { + "qualifiedName": 
"SupportedBrowserType", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2006": { + "qualifiedName": "ScreenOptions", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2007": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2008": { + "qualifiedName": "min_width", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2009": { + "qualifiedName": "max_width", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2010": { + "qualifiedName": "min_height", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2011": { + "qualifiedName": "max_height", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2012": { + "qualifiedName": "HeaderGeneratorOptions", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2013": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2014": { + "qualifiedName": "browsers", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2015": { + "qualifiedName": "operating_systems", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2016": { + "qualifiedName": "devices", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2017": { + "qualifiedName": "locales", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2018": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2019": { + "qualifiedName": "strict", + "sourceFileName": "/src/crawlee/fingerprint_suite/_types.py" + }, + "2020": { + "qualifiedName": "PatchedHeaderGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2021": { + "qualifiedName": "generate", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2031": { + "qualifiedName": 
"PatchedFingerprintGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2032": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2039": { + "qualifiedName": "BrowserforgeFingerprintGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2040": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2046": { + "qualifiedName": "generate", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2048": { + "qualifiedName": "BrowserforgeHeaderGenerator", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2049": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2051": { + "qualifiedName": "generate", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2054": { + "qualifiedName": "get_available_header_network", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2056": { + "qualifiedName": "get_available_header_values", + "sourceFileName": "/src/crawlee/fingerprint_suite/_browserforge_adapter.py" + }, + "2060": { + "qualifiedName": "HttpResponse", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2061": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2062": { + "qualifiedName": "status_code", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2063": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2064": { + "qualifiedName": "read", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2066": { + "qualifiedName": "read_stream", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2068": { + "qualifiedName": 
"HttpCrawlingResult", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2069": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2070": { + "qualifiedName": "HttpClient", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2071": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2074": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2075": { + "qualifiedName": "crawl", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2082": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2091": { + "qualifiedName": "stream", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2100": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2102": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2104": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "2109": { + "qualifiedName": "_EmptyCookies", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2110": { + "qualifiedName": "get_cookies_for_curl", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2113": { + "qualifiedName": "update_cookies_from_curl", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2116": { + "qualifiedName": "_AsyncSession", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2117": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2121": { + "qualifiedName": "_CurlImpersonateResponse", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2122": { + "qualifiedName": "__init__", + "sourceFileName": 
"/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2125": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2126": { + "qualifiedName": "status_code", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2127": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2128": { + "qualifiedName": "read", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2130": { + "qualifiedName": "read_stream", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2132": { + "qualifiedName": "CurlImpersonateHttpClient", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2133": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2137": { + "qualifiedName": "crawl", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2144": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2153": { + "qualifiedName": "stream", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2162": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/http_clients/_curl_impersonate.py" + }, + "2164": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2165": { + "qualifiedName": "_ClientCacheEntry", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2166": { + "qualifiedName": "client", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2167": { + "qualifiedName": "cookie_jar", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2168": { + "qualifiedName": "_ImpitResponse", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2169": { + "qualifiedName": "__init__", + "sourceFileName": 
"/src/crawlee/http_clients/_impit.py" + }, + "2172": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2173": { + "qualifiedName": "status_code", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2174": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2175": { + "qualifiedName": "read", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2177": { + "qualifiedName": "read_stream", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2179": { + "qualifiedName": "ImpitHttpClient", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2180": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2187": { + "qualifiedName": "crawl", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2194": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2203": { + "qualifiedName": "stream", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2212": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/http_clients/_impit.py" + }, + "2214": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2215": { + "qualifiedName": "_HttpxResponse", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2216": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2219": { + "qualifiedName": "http_version", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2220": { + "qualifiedName": "status_code", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2221": { + "qualifiedName": "headers", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2222": { + "qualifiedName": "read", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2224": { + "qualifiedName": 
"read_stream", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2226": { + "qualifiedName": "_HttpxTransport", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2227": { + "qualifiedName": "handle_async_request", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2230": { + "qualifiedName": "HttpxHttpClient", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2231": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2239": { + "qualifiedName": "crawl", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2246": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2255": { + "qualifiedName": "stream", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2264": { + "qualifiedName": "cleanup", + "sourceFileName": "/src/crawlee/http_clients/_httpx.py" + }, + "2266": { + "qualifiedName": "CrawlerInstrumentor", + "sourceFileName": "/src/crawlee/otel/crawler_instrumentor.py" + }, + "2267": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/otel/crawler_instrumentor.py" + }, + "2271": { + "qualifiedName": "instrumentation_dependencies", + "sourceFileName": "/src/crawlee/otel/crawler_instrumentor.py" + }, + "2273": { + "qualifiedName": "RequestLoader", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2274": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2276": { + "qualifiedName": "get_total_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2278": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2280": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2282": { + "qualifiedName": "fetch_next_request", + 
"sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2284": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2287": { + "qualifiedName": "to_tandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "2290": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2291": { + "qualifiedName": "SitemapRequestLoaderState", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2292": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2293": { + "qualifiedName": "url_queue", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2294": { + "qualifiedName": "in_progress", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2295": { + "qualifiedName": "pending_sitemap_urls", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2296": { + "qualifiedName": "in_progress_sitemap_url", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2297": { + "qualifiedName": "current_sitemap_processed_urls", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2298": { + "qualifiedName": "processed_sitemap_urls", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2299": { + "qualifiedName": "completed", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2300": { + "qualifiedName": "total_count", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2301": { + "qualifiedName": "handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2302": { + "qualifiedName": "SitemapRequestLoader", + 
"sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2303": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2313": { + "qualifiedName": "get_total_count", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2315": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2317": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2319": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2321": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2323": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2326": { + "qualifiedName": "abort_loading", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2328": { + "qualifiedName": "start", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2330": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2332": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2334": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/request_loaders/_sitemap_request_loader.py" + }, + "2339": { + "qualifiedName": "RequestManager", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager.py" + }, + "2340": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager.py" + }, + "2342": { + "qualifiedName": "add_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager.py" + 
}, + "2346": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager.py" + }, + "2354": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager.py" + }, + "2358": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2359": { + "qualifiedName": "RequestManagerTandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2360": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2364": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2366": { + "qualifiedName": "get_total_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2368": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2370": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2372": { + "qualifiedName": "add_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2376": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2384": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2386": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2390": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2393": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/request_loaders/_request_manager_tandem.py" + }, + "2395": { + "qualifiedName": "logger", + 
"sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2396": { + "qualifiedName": "RequestListState", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2397": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2398": { + "qualifiedName": "next_index", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2399": { + "qualifiedName": "next_unique_key", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2400": { + "qualifiedName": "in_progress", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2401": { + "qualifiedName": "RequestListData", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2402": { + "qualifiedName": "requests", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2403": { + "qualifiedName": "RequestList", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2404": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2410": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2411": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2413": { + "qualifiedName": "get_total_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2415": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2417": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2419": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + }, + "2421": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/request_loaders/_request_list.py" + 
}, + "2424": { + "qualifiedName": "SessionModel", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2425": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2426": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2427": { + "qualifiedName": "max_age", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2428": { + "qualifiedName": "user_data", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2429": { + "qualifiedName": "max_error_score", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2430": { + "qualifiedName": "error_score_decrement", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2431": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2432": { + "qualifiedName": "usage_count", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2433": { + "qualifiedName": "max_usage_count", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2434": { + "qualifiedName": "error_score", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2435": { + "qualifiedName": "cookies", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2436": { + "qualifiedName": "blocked_status_codes", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2437": { + "qualifiedName": "SessionPoolModel", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2438": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2439": { + "qualifiedName": "max_pool_size", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2440": { + "qualifiedName": "sessions", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2441": { + "qualifiedName": "session_count", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2442": { + "qualifiedName": "usable_session_count", + 
"sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2443": { + "qualifiedName": "retired_session_count", + "sourceFileName": "/src/crawlee/sessions/_models.py" + }, + "2444": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2445": { + "qualifiedName": "Session", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2446": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2459": { + "qualifiedName": "from_model", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2462": { + "qualifiedName": "__repr__", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2464": { + "qualifiedName": "__eq__", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2467": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2469": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2470": { + "qualifiedName": "user_data", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2471": { + "qualifiedName": "cookies", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2472": { + "qualifiedName": "error_score", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2473": { + "qualifiedName": "usage_count", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2474": { + "qualifiedName": "expires_at", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2475": { + "qualifiedName": "is_blocked", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2476": { + "qualifiedName": "is_expired", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2477": { + "qualifiedName": "is_max_usage_count_reached", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2478": { + "qualifiedName": "is_usable", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2479": { + "qualifiedName": 
"get_state", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2482": { + "qualifiedName": "mark_good", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2484": { + "qualifiedName": "mark_bad", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2486": { + "qualifiedName": "retire", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2488": { + "qualifiedName": "is_blocked_status_code", + "sourceFileName": "/src/crawlee/sessions/_session.py" + }, + "2496": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2497": { + "qualifiedName": "CreateSessionFunctionType", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2498": { + "qualifiedName": "SessionPool", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2499": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2508": { + "qualifiedName": "__repr__", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2510": { + "qualifiedName": "session_count", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2511": { + "qualifiedName": "usable_session_count", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2512": { + "qualifiedName": "retired_session_count", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2513": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2514": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2516": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2521": { + "qualifiedName": "get_state", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2524": { + "qualifiedName": "add_session", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2527": { + 
"qualifiedName": "get_session", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2529": { + "qualifiedName": "get_session_by_id", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2532": { + "qualifiedName": "reset_store", + "sourceFileName": "/src/crawlee/sessions/_session_pool.py" + }, + "2538": { + "qualifiedName": "CookieParam", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2539": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2540": { + "qualifiedName": "value", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2541": { + "qualifiedName": "domain", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2542": { + "qualifiedName": "path", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2543": { + "qualifiedName": "secure", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2544": { + "qualifiedName": "http_only", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2545": { + "qualifiedName": "expires", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2546": { + "qualifiedName": "same_site", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2547": { + "qualifiedName": "PlaywrightCookieParam", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2548": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2549": { + "qualifiedName": "value", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2550": { + "qualifiedName": "domain", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2551": { + "qualifiedName": "path", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2552": { + "qualifiedName": "secure", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2553": { + "qualifiedName": "httpOnly", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2554": { + 
"qualifiedName": "expires", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2555": { + "qualifiedName": "sameSite", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2556": { + "qualifiedName": "partitionKey", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2557": { + "qualifiedName": "SessionCookies", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2558": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2561": { + "qualifiedName": "jar", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2562": { + "qualifiedName": "set", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2573": { + "qualifiedName": "get_cookies_as_dicts", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2575": { + "qualifiedName": "store_cookie", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2578": { + "qualifiedName": "store_cookies", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2581": { + "qualifiedName": "set_cookies", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2584": { + "qualifiedName": "get_cookies_as_playwright_format", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2586": { + "qualifiedName": "set_cookies_from_playwright_format", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2589": { + "qualifiedName": "__deepcopy__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2592": { + "qualifiedName": "__len__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2594": { + "qualifiedName": "__setitem__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2598": { + "qualifiedName": "__getitem__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2601": { + "qualifiedName": "__iter__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2603": { + "qualifiedName": "__repr__", + 
"sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2605": { + "qualifiedName": "__bool__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2607": { + "qualifiedName": "__eq__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2610": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/sessions/_cookies.py" + }, + "2612": { + "qualifiedName": "ErrorSnapshotter", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2613": { + "qualifiedName": "MAX_ERROR_CHARACTERS", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2614": { + "qualifiedName": "MAX_HASH_LENGTH", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2615": { + "qualifiedName": "MAX_FILENAME_LENGTH", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2616": { + "qualifiedName": "BASE_MESSAGE", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2617": { + "qualifiedName": "SNAPSHOT_PREFIX", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2618": { + "qualifiedName": "ALLOWED_CHARACTERS", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2619": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2622": { + "qualifiedName": "capture_snapshot", + "sourceFileName": "/src/crawlee/statistics/_error_snapshotter.py" + }, + "2627": { + "qualifiedName": "GroupName", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2628": { + "qualifiedName": "ErrorFilenameGroups", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2629": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2630": { + "qualifiedName": "ErrorTracker", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2631": { + "qualifiedName": "__init__", + 
"sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2639": { + "qualifiedName": "add", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2644": { + "qualifiedName": "unique_error_count", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2645": { + "qualifiedName": "total", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2646": { + "qualifiedName": "get_most_common_errors", + "sourceFileName": "/src/crawlee/statistics/_error_tracker.py" + }, + "2649": { + "qualifiedName": "FinalStatistics", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2650": { + "qualifiedName": "requests_finished", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2651": { + "qualifiedName": "requests_failed", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2652": { + "qualifiedName": "retry_histogram", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2653": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2654": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2655": { + "qualifiedName": "requests_finished_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2656": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2657": { + "qualifiedName": "request_total_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2658": { + "qualifiedName": "requests_total", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2659": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2660": { + "qualifiedName": "to_table", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2662": { + "qualifiedName": "to_dict", + "sourceFileName": 
"/src/crawlee/statistics/_models.py" + }, + "2664": { + "qualifiedName": "__str__", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2666": { + "qualifiedName": "StatisticsState", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2667": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2668": { + "qualifiedName": "stats_id", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2669": { + "qualifiedName": "requests_finished", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2670": { + "qualifiedName": "requests_failed", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2671": { + "qualifiedName": "requests_retries", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2672": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2673": { + "qualifiedName": "requests_finished_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2674": { + "qualifiedName": "request_min_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2675": { + "qualifiedName": "request_max_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2676": { + "qualifiedName": "request_total_failed_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2677": { + "qualifiedName": "request_total_finished_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2678": { + "qualifiedName": "crawler_started_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2679": { + "qualifiedName": "crawler_last_started_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2680": { + "qualifiedName": "crawler_finished_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2681": { + "qualifiedName": "stats_persisted_at", + "sourceFileName": 
"/src/crawlee/statistics/_models.py" + }, + "2682": { + "qualifiedName": "request_retry_histogram", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2683": { + "qualifiedName": "model_post_init", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2686": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2687": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2690": { + "qualifiedName": "crawler_runtime_for_serialization", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2692": { + "qualifiedName": "request_total_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2693": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2694": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2695": { + "qualifiedName": "requests_total", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "2696": { + "qualifiedName": "TStatisticsState", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2697": { + "qualifiedName": "TNewStatisticsState", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2698": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2699": { + "qualifiedName": "RequestProcessingRecord", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2700": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2702": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2704": { + "qualifiedName": "finish", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2706": { + "qualifiedName": "retry_count", + "sourceFileName": 
"/src/crawlee/statistics/_statistics.py" + }, + "2707": { + "qualifiedName": "Statistics", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2708": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2720": { + "qualifiedName": "replace_state_model", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2723": { + "qualifiedName": "with_default_state", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2734": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2735": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2737": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2742": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2743": { + "qualifiedName": "register_status_code", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2746": { + "qualifiedName": "record_request_processing_start", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2749": { + "qualifiedName": "record_request_processing_finish", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2752": { + "qualifiedName": "record_request_processing_failure", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2755": { + "qualifiedName": "calculate", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2757": { + "qualifiedName": "reset", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "2759": { + "qualifiedName": "RequestQueueClient", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2760": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2762": { + "qualifiedName": "drop", + 
"sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2764": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2766": { + "qualifiedName": "add_batch_of_requests", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2770": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2773": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2775": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2778": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2782": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storage_clients/_base/_request_queue_client.py" + }, + "2784": { + "qualifiedName": "StorageClient", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2785": { + "qualifiedName": "get_storage_client_cache_key", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2788": { + "qualifiedName": "create_dataset_client", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2794": { + "qualifiedName": "create_kvs_client", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2800": { + "qualifiedName": "create_rq_client", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2806": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "2808": { + "qualifiedName": "DatasetClient", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2809": { + 
"qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2811": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2813": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2815": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2818": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2831": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storage_clients/_base/_dataset_client.py" + }, + "2842": { + "qualifiedName": "KeyValueStoreClient", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2843": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2845": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2847": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2849": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2852": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2857": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2860": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2864": { + "qualifiedName": "get_public_url", + "sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2867": { + "qualifiedName": "record_exists", + 
"sourceFileName": "/src/crawlee/storage_clients/_base/_key_value_store_client.py" + }, + "2870": { + "qualifiedName": "FileSystemStorageClient", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py" + }, + "2871": { + "qualifiedName": "get_storage_client_cache_key", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py" + }, + "2874": { + "qualifiedName": "create_dataset_client", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py" + }, + "2880": { + "qualifiedName": "create_kvs_client", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py" + }, + "2886": { + "qualifiedName": "create_rq_client", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_storage_client.py" + }, + "2892": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2893": { + "qualifiedName": "FileSystemDatasetClient", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2894": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2899": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2901": { + "qualifiedName": "path_to_dataset", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2902": { + "qualifiedName": "path_to_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2903": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2909": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2911": { + "qualifiedName": "purge", + "sourceFileName": 
"/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2913": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2916": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2929": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_dataset_client.py" + }, + "2940": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2941": { + "qualifiedName": "FileSystemKeyValueStoreClient", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2942": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2947": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2949": { + "qualifiedName": "path_to_kvs", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2950": { + "qualifiedName": "path_to_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2951": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2957": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2959": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2961": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2964": { + "qualifiedName": "set_value", + "sourceFileName": 
"/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2969": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2972": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2976": { + "qualifiedName": "get_public_url", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2979": { + "qualifiedName": "record_exists", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_key_value_store_client.py" + }, + "2982": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2983": { + "qualifiedName": "RequestQueueState", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2984": { + "qualifiedName": "sequence_counter", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2985": { + "qualifiedName": "forefront_sequence_counter", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2986": { + "qualifiedName": "forefront_requests", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2987": { + "qualifiedName": "regular_requests", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2988": { + "qualifiedName": "in_progress_requests", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2989": { + "qualifiedName": "handled_requests", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2990": { + "qualifiedName": "FileSystemRequestQueueClient", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2991": { + 
"qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2997": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "2999": { + "qualifiedName": "path_to_rq", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3000": { + "qualifiedName": "path_to_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3001": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3007": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3009": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3011": { + "qualifiedName": "add_batch_of_requests", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3015": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3018": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3020": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3023": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3027": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storage_clients/_file_system/_request_queue_client.py" + }, + "3029": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3030": { + "qualifiedName": "MemoryDatasetClient", + 
"sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3031": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3034": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3036": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3041": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3043": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3045": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3048": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3061": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_dataset_client.py" + }, + "3072": { + "qualifiedName": "MemoryKeyValueStoreClient", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3073": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3076": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3078": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3083": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3085": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3087": { + "qualifiedName": "get_value", + "sourceFileName": 
"/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3090": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3095": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3098": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3102": { + "qualifiedName": "get_public_url", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3105": { + "qualifiedName": "record_exists", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_key_value_store_client.py" + }, + "3108": { + "qualifiedName": "MemoryStorageClient", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_storage_client.py" + }, + "3109": { + "qualifiedName": "create_dataset_client", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_storage_client.py" + }, + "3115": { + "qualifiedName": "create_kvs_client", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_storage_client.py" + }, + "3121": { + "qualifiedName": "create_rq_client", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_storage_client.py" + }, + "3127": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3128": { + "qualifiedName": "MemoryRequestQueueClient", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3129": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3132": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3134": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3139": { + 
"qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3141": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3143": { + "qualifiedName": "add_batch_of_requests", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3147": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3149": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3152": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3155": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3159": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storage_clients/_memory/_request_queue_client.py" + }, + "3161": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3162": { + "qualifiedName": "MetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3163": { + "qualifiedName": "update_accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3164": { + "qualifiedName": "update_modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3165": { + "qualifiedName": "RedisClientMixin", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3166": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3171": { + "qualifiedName": "redis", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3172": { + 
"qualifiedName": "metadata_key", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3173": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3174": { + "qualifiedName": "_DatasetMetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3175": { + "qualifiedName": "new_item_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3176": { + "qualifiedName": "delta_item_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3177": { + "qualifiedName": "RedisDatasetClient", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3178": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3183": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3189": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3191": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3193": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3195": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3198": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3211": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_dataset_client.py" + }, + "3222": { + "qualifiedName": "RedisStorageClient", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_storage_client.py" + }, + "3223": { + "qualifiedName": "__init__", + "sourceFileName": 
"/src/crawlee/storage_clients/_redis/_storage_client.py" + }, + "3229": { + "qualifiedName": "create_dataset_client", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_storage_client.py" + }, + "3235": { + "qualifiedName": "create_kvs_client", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_storage_client.py" + }, + "3241": { + "qualifiedName": "create_rq_client", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_storage_client.py" + }, + "3247": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3248": { + "qualifiedName": "RedisKeyValueStoreClient", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3249": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3254": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3260": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3262": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3264": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3266": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3271": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3274": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3277": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3281": { + "qualifiedName": "get_public_url", + 
"sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3284": { + "qualifiedName": "record_exists", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_key_value_store_client.py" + }, + "3287": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_utils.py" + }, + "3288": { + "qualifiedName": "await_redis_response", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_utils.py" + }, + "3291": { + "qualifiedName": "read_lua_script", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_utils.py" + }, + "3294": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3295": { + "qualifiedName": "_QueueMetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3296": { + "qualifiedName": "new_handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3297": { + "qualifiedName": "new_pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3298": { + "qualifiedName": "new_total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3299": { + "qualifiedName": "delta_handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3300": { + "qualifiedName": "delta_pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3301": { + "qualifiedName": "delta_total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3302": { + "qualifiedName": "recalculate", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3303": { + "qualifiedName": "update_had_multiple_clients", + "sourceFileName": 
"/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3304": { + "qualifiedName": "RedisRequestQueueClient", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3305": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3312": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3320": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3322": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3324": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3326": { + "qualifiedName": "add_batch_of_requests", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3330": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3332": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3335": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3338": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3342": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_request_queue_client.py" + }, + "3344": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3345": { + "qualifiedName": "_DatasetMetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3346": { + "qualifiedName": 
"new_item_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3347": { + "qualifiedName": "delta_item_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3348": { + "qualifiedName": "SqlDatasetClient", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3349": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3353": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3359": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3361": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3363": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3365": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3368": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3381": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_dataset_client.py" + }, + "3392": { + "qualifiedName": "AwareDateTime", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3393": { + "qualifiedName": "impl", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3394": { + "qualifiedName": "cache_ok", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3395": { + "qualifiedName": "process_result_value", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3399": { + "qualifiedName": "JsonField", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3400": { + "qualifiedName": "impl", + 
"sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3401": { + "qualifiedName": "cache_ok", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3402": { + "qualifiedName": "load_dialect_impl", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3405": { + "qualifiedName": "Base", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3406": { + "qualifiedName": "StorageMetadataDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3407": { + "qualifiedName": "internal_name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3408": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3409": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3410": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3411": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3412": { + "qualifiedName": "buffer_locked_until", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3413": { + "qualifiedName": "DatasetMetadataDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3414": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3415": { + "qualifiedName": "dataset_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3416": { + "qualifiedName": "item_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3417": { + "qualifiedName": "items", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3418": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" 
+ }, + "3419": { + "qualifiedName": "RequestQueueMetadataDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3420": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3421": { + "qualifiedName": "request_queue_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3422": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3423": { + "qualifiedName": "handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3424": { + "qualifiedName": "pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3425": { + "qualifiedName": "total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3426": { + "qualifiedName": "requests", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3427": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3428": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3429": { + "qualifiedName": "KeyValueStoreMetadataDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3430": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3431": { + "qualifiedName": "key_value_store_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3432": { + "qualifiedName": "records", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3433": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3434": { + "qualifiedName": "KeyValueStoreRecordDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + 
"3435": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3436": { + "qualifiedName": "key_value_store_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3437": { + "qualifiedName": "key", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3438": { + "qualifiedName": "value", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3439": { + "qualifiedName": "content_type", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3440": { + "qualifiedName": "size", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3441": { + "qualifiedName": "kvs", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3442": { + "qualifiedName": "storage_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3443": { + "qualifiedName": "DatasetItemDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3444": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3445": { + "qualifiedName": "item_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3446": { + "qualifiedName": "dataset_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3447": { + "qualifiedName": "data", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3448": { + "qualifiedName": "dataset", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3449": { + "qualifiedName": "storage_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3450": { + "qualifiedName": "RequestDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3451": { + "qualifiedName": "__tablename__", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3452": { + "qualifiedName": "__table_args__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3453": { + "qualifiedName": "request_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3454": { + "qualifiedName": "request_queue_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3455": { + "qualifiedName": "data", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3456": { + "qualifiedName": "sequence_number", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3457": { + "qualifiedName": "is_handled", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3458": { + "qualifiedName": "time_blocked_until", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3459": { + "qualifiedName": "client_key", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3460": { + "qualifiedName": "queue", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3461": { + "qualifiedName": "storage_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3462": { + "qualifiedName": "RequestQueueStateDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3463": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3464": { + "qualifiedName": "request_queue_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3465": { + "qualifiedName": "sequence_counter", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3466": { + "qualifiedName": "forefront_sequence_counter", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3467": { + "qualifiedName": "queue", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3468": { + "qualifiedName": "VersionDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3469": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3470": { + "qualifiedName": "version", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3471": { + "qualifiedName": "MetadataBufferDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3472": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3473": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3474": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3475": { + "qualifiedName": "KeyValueStoreMetadataBufferDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3476": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3477": { + "qualifiedName": "key_value_store_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3478": { + "qualifiedName": "storage_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3479": { + "qualifiedName": "DatasetMetadataBufferDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3480": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3481": { + "qualifiedName": "dataset_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3482": { + "qualifiedName": "delta_item_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3483": { + "qualifiedName": "storage_id", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3484": { + "qualifiedName": "RequestQueueMetadataBufferDb", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3485": { + "qualifiedName": "__tablename__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3486": { + "qualifiedName": "__table_args__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3487": { + "qualifiedName": "request_queue_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3488": { + "qualifiedName": "client_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3489": { + "qualifiedName": "delta_handled_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3490": { + "qualifiedName": "delta_pending_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3491": { + "qualifiedName": "delta_total_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3492": { + "qualifiedName": "need_recalc", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3493": { + "qualifiedName": "storage_id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3494": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3495": { + "qualifiedName": "SqlKeyValueStoreClient", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3496": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3500": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3506": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3508": { + 
"qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3510": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3512": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3517": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3520": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3523": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3527": { + "qualifiedName": "record_exists", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3530": { + "qualifiedName": "get_public_url", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_key_value_store_client.py" + }, + "3533": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3534": { + "qualifiedName": "MetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3535": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3536": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3537": { + "qualifiedName": "SqlClientMixin", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3538": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3542": { + "qualifiedName": "get_session", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3545": { + "qualifiedName": "logger", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3546": { + "qualifiedName": "_QueueMetadataUpdateParams", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3547": { + "qualifiedName": "new_handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3548": { + "qualifiedName": "new_pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3549": { + "qualifiedName": "new_total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3550": { + "qualifiedName": "delta_handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3551": { + "qualifiedName": "delta_pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3552": { + "qualifiedName": "delta_total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3553": { + "qualifiedName": "recalculate", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3554": { + "qualifiedName": "update_had_multiple_clients", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3555": { + "qualifiedName": "SqlRequestQueueClient", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3556": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3560": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3566": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3568": { + "qualifiedName": "drop", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3570": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3572": { + "qualifiedName": "add_batch_of_requests", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3576": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3579": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3581": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3584": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3588": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_request_queue_client.py" + }, + "3590": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3591": { + "qualifiedName": "SqlStorageClient", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3592": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3596": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3598": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3603": { + "qualifiedName": "engine", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3604": { + "qualifiedName": "get_dialect_name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3606": { + "qualifiedName": "initialize", + "sourceFileName": 
"/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3609": { + "qualifiedName": "close", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3611": { + "qualifiedName": "create_session", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3613": { + "qualifiedName": "create_dataset_client", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3619": { + "qualifiedName": "create_kvs_client", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3625": { + "qualifiedName": "create_rq_client", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_storage_client.py" + }, + "3631": { + "qualifiedName": "KvsValueType", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3632": { + "qualifiedName": "StorageMetadata", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3633": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3634": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3635": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3636": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3637": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3638": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3639": { + "qualifiedName": "DatasetMetadata", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3640": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3641": { + "qualifiedName": "item_count", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3642": { + "qualifiedName": "KeyValueStoreMetadata", + 
"sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3643": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3644": { + "qualifiedName": "RequestQueueMetadata", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3645": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3646": { + "qualifiedName": "had_multiple_clients", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3647": { + "qualifiedName": "handled_request_count", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3648": { + "qualifiedName": "pending_request_count", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3649": { + "qualifiedName": "total_request_count", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3650": { + "qualifiedName": "KeyValueStoreRecordMetadata", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3651": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3652": { + "qualifiedName": "key", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3653": { + "qualifiedName": "content_type", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3654": { + "qualifiedName": "size", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3655": { + "qualifiedName": "KeyValueStoreRecord", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3656": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3657": { + "qualifiedName": "value", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3658": { + "qualifiedName": "DatasetItemsListPage", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3659": { + "qualifiedName": "model_config", + "sourceFileName": 
"/src/crawlee/storage_clients/models.py" + }, + "3660": { + "qualifiedName": "count", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3661": { + "qualifiedName": "offset", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3662": { + "qualifiedName": "limit", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3663": { + "qualifiedName": "total", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3664": { + "qualifiedName": "desc", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3665": { + "qualifiedName": "ProcessedRequest", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3666": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3667": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3668": { + "qualifiedName": "unique_key", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3669": { + "qualifiedName": "was_already_present", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3670": { + "qualifiedName": "was_already_handled", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3671": { + "qualifiedName": "UnprocessedRequest", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3672": { + "qualifiedName": "model_config", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3673": { + "qualifiedName": "unique_key", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3674": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3675": { + "qualifiedName": "method", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3676": { + "qualifiedName": "AddRequestsResponse", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3677": { + "qualifiedName": "model_config", + 
"sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3678": { + "qualifiedName": "processed_requests", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3679": { + "qualifiedName": "unprocessed_requests", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3680": { + "qualifiedName": "Storage", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3681": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3682": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3683": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3685": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3692": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3694": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storages/_base.py" + }, + "3696": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3697": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3698": { + "qualifiedName": "AutosavedValue", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3699": { + "qualifiedName": "root", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3700": { + "qualifiedName": "KeyValueStore", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3701": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3706": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3707": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3708": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + 
"3710": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3717": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3719": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3721": { + "qualifiedName": "get_value", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3725": { + "qualifiedName": "set_value", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3730": { + "qualifiedName": "delete_value", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3733": { + "qualifiedName": "iterate_keys", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3737": { + "qualifiedName": "list_keys", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3741": { + "qualifiedName": "record_exists", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3744": { + "qualifiedName": "get_public_url", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3747": { + "qualifiedName": "get_auto_saved_value", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3751": { + "qualifiedName": "persist_autosaved_values", + "sourceFileName": "/src/crawlee/storages/_key_value_store.py" + }, + "3761": { + "qualifiedName": "NAME_REGEX", + "sourceFileName": "/src/crawlee/storages/_utils.py" + }, + "3762": { + "qualifiedName": "validate_storage_name", + "sourceFileName": "/src/crawlee/storages/_utils.py" + }, + "3765": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3766": { + "qualifiedName": "Dataset", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3767": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3772": { + "qualifiedName": "id", + "sourceFileName": 
"/src/crawlee/storages/_dataset.py" + }, + "3773": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3774": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3776": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3783": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3785": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3787": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3790": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3803": { + "qualifiedName": "iterate_items", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3814": { + "qualifiedName": "list_items", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3825": { + "qualifiedName": "export_to", + "sourceFileName": "/src/crawlee/storages/_dataset.py" + }, + "3850": { + "qualifiedName": "logger", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3851": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3852": { + "qualifiedName": "RequestQueue", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3853": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3858": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3859": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3860": { + "qualifiedName": "get_metadata", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3862": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3864": { + "qualifiedName": 
"get_total_count", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3866": { + "qualifiedName": "open", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3873": { + "qualifiedName": "drop", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3875": { + "qualifiedName": "purge", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3877": { + "qualifiedName": "add_request", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3881": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3889": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3891": { + "qualifiedName": "get_request", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3894": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3897": { + "qualifiedName": "reclaim_request", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3901": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3903": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/storages/_request_queue.py" + }, + "3905": { + "qualifiedName": "T", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3906": { + "qualifiedName": "_StorageCache", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3907": { + "qualifiedName": "by_id", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3908": { + "qualifiedName": "by_name", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3909": { + "qualifiedName": "by_alias", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3910": { + "qualifiedName": "remove_from_cache", + 
"sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3913": { + "qualifiedName": "ClientOpenerCoro", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3914": { + "qualifiedName": "StorageInstanceManager", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3915": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3917": { + "qualifiedName": "open_storage_instance", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3924": { + "qualifiedName": "remove_from_cache", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3927": { + "qualifiedName": "clear_cache", + "sourceFileName": "/src/crawlee/storages/_storage_instance_manager.py" + }, + "3929": { + "qualifiedName": "key", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3930": { + "qualifiedName": "content_type", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3931": { + "qualifiedName": "size", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3932": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3933": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3934": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3935": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3936": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3937": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3938": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3939": { + "qualifiedName": "accessed_at", + "sourceFileName": 
"/src/crawlee/storage_clients/models.py" + }, + "3940": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3941": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3942": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3943": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3944": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3945": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3946": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/models.py" + }, + "3947": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3948": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3949": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3950": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3951": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3952": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3953": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3954": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3955": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3956": { + "qualifiedName": "internal_name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + 
"3957": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3958": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3959": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3960": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3961": { + "qualifiedName": "buffer_locked_until", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3962": { + "qualifiedName": "internal_name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3963": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3964": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3965": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3966": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3967": { + "qualifiedName": "buffer_locked_until", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3968": { + "qualifiedName": "internal_name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3969": { + "qualifiedName": "name", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3970": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3971": { + "qualifiedName": "created_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3972": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3973": { + "qualifiedName": "buffer_locked_until", + 
"sourceFileName": "/src/crawlee/storage_clients/_sql/_db_models.py" + }, + "3974": { + "qualifiedName": "get_session", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3975": { + "qualifiedName": "get_session", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3976": { + "qualifiedName": "get_session", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3977": { + "qualifiedName": "redis", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3978": { + "qualifiedName": "metadata_key", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3979": { + "qualifiedName": "redis", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3980": { + "qualifiedName": "metadata_key", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3981": { + "qualifiedName": "redis", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3982": { + "qualifiedName": "metadata_key", + "sourceFileName": "/src/crawlee/storage_clients/_redis/_client_mixin.py" + }, + "3983": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3984": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3985": { + "qualifiedName": "accessed_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3986": { + "qualifiedName": "modified_at", + "sourceFileName": "/src/crawlee/storage_clients/_sql/_client_mixin.py" + }, + "3987": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3988": { + "qualifiedName": "get_storage_client_cache_key", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3989": { + "qualifiedName": 
"get_rate_limit_errors", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3990": { + "qualifiedName": "get_storage_client_cache_key", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3991": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3992": { + "qualifiedName": "get_storage_client_cache_key", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3993": { + "qualifiedName": "get_rate_limit_errors", + "sourceFileName": "/src/crawlee/storage_clients/_base/_storage_client.py" + }, + "3994": { + "qualifiedName": "to_tandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "3995": { + "qualifiedName": "get_handled_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "3996": { + "qualifiedName": "get_total_count", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "3997": { + "qualifiedName": "is_empty", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "3998": { + "qualifiedName": "is_finished", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "3999": { + "qualifiedName": "fetch_next_request", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "4000": { + "qualifiedName": "mark_request_as_handled", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "4001": { + "qualifiedName": "to_tandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "4002": { + "qualifiedName": "to_tandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "4003": { + "qualifiedName": "to_tandem", + "sourceFileName": "/src/crawlee/request_loaders/_request_loader.py" + }, + "4004": { + "qualifiedName": "to_tandem", + "sourceFileName": 
"/src/crawlee/request_loaders/_request_loader.py" + }, + "4005": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "4006": { + "qualifiedName": "on", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "4007": { + "qualifiedName": "off", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "4008": { + "qualifiedName": "emit", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "4009": { + "qualifiedName": "wait_for_all_listeners_to_complete", + "sourceFileName": "/src/crawlee/events/_event_manager.py" + }, + "4010": { + "qualifiedName": "browser_pool", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "4011": { + "qualifiedName": "browser_type", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "4012": { + "qualifiedName": "browser_launch_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "4013": { + "qualifiedName": "browser_new_context_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "4014": { + "qualifiedName": "headless", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_crawler.py" + }, + "4015": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4016": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4017": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4018": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4019": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4020": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4021": { + "qualifiedName": "active", + "sourceFileName": 
"/src/crawlee/http_clients/_base.py" + }, + "4022": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4023": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4024": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4025": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4026": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4027": { + "qualifiedName": "is_blocked", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "4028": { + "qualifiedName": "is_blocked", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py" + }, + "4029": { + "qualifiedName": "request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4030": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4031": { + "qualifiedName": "configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4032": { + "qualifiedName": "event_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4033": { + "qualifiedName": "storage_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4034": { + "qualifiedName": "request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4035": { + "qualifiedName": "session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4036": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4037": { + "qualifiedName": "http_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4038": { + "qualifiedName": "max_request_retries", + 
"sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4039": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4040": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4041": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4042": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4043": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4044": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4045": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4046": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4047": { + "qualifiedName": "configure_logging", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4048": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4049": { + "qualifiedName": "keep_alive", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4050": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4051": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4052": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4053": { + "qualifiedName": "status_message_logging_interval", + "sourceFileName": 
"/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4054": { + "qualifiedName": "status_message_callback", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4055": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4056": { + "qualifiedName": "replace_state_model", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4057": { + "qualifiedName": "with_default_state", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4058": { + "qualifiedName": "active", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4059": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4060": { + "qualifiedName": "register_status_code", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4061": { + "qualifiedName": "record_request_processing_start", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4062": { + "qualifiedName": "record_request_processing_finish", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4063": { + "qualifiedName": "record_request_processing_failure", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4064": { + "qualifiedName": "calculate", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4065": { + "qualifiedName": "reset", + "sourceFileName": "/src/crawlee/statistics/_statistics.py" + }, + "4066": { + "qualifiedName": "__aenter__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "4067": { + "qualifiedName": "__aexit__", + "sourceFileName": "/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py" + }, + "4068": { + "qualifiedName": "stats_id", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4069": { + "qualifiedName": "requests_finished", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + 
"4070": { + "qualifiedName": "requests_failed", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4071": { + "qualifiedName": "requests_retries", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4072": { + "qualifiedName": "requests_failed_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4073": { + "qualifiedName": "requests_finished_per_minute", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4074": { + "qualifiedName": "request_min_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4075": { + "qualifiedName": "request_max_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4076": { + "qualifiedName": "request_total_failed_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4077": { + "qualifiedName": "request_total_finished_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4078": { + "qualifiedName": "crawler_started_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4079": { + "qualifiedName": "crawler_last_started_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4080": { + "qualifiedName": "crawler_finished_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4081": { + "qualifiedName": "stats_persisted_at", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4082": { + "qualifiedName": "request_retry_histogram", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4083": { + "qualifiedName": "model_post_init", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4084": { + "qualifiedName": "crawler_runtime", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4085": { + "qualifiedName": "crawler_runtime_for_serialization", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4086": { + "qualifiedName": "request_total_duration", + "sourceFileName": 
"/src/crawlee/statistics/_models.py" + }, + "4087": { + "qualifiedName": "request_avg_failed_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4088": { + "qualifiedName": "request_avg_finished_duration", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4089": { + "qualifiedName": "requests_total", + "sourceFileName": "/src/crawlee/statistics/_models.py" + }, + "4090": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4091": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4092": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4093": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4094": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4095": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4096": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4097": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4098": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4099": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4100": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4101": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4102": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4103": { + "qualifiedName": 
"get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4104": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4105": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4106": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4107": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4108": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4109": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4110": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4111": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4112": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4113": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4114": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4115": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4116": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4117": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4118": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4119": { + "qualifiedName": "export_data", + "sourceFileName": 
"/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4120": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4121": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4122": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4123": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4124": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4125": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4126": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4127": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4128": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4129": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4130": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4131": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4132": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4133": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4134": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4135": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": 
"/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4136": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4137": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4138": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4139": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4140": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4141": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4142": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4143": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4144": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4145": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4146": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4147": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4148": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4149": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4150": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4151": { + "qualifiedName": "get_data", + "sourceFileName": 
"/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4152": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4153": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4154": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4155": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4156": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4157": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4158": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4159": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4160": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4161": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4162": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4163": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4164": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4165": { + "qualifiedName": "on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4166": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4167": { + "qualifiedName": "add_requests", + 
"sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4168": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4169": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4170": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4171": { + "qualifiedName": "create_parsed_http_crawler_class", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4172": { + "qualifiedName": "pre_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4173": { + "qualifiedName": "post_navigation_hook", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py" + }, + "4174": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4175": { + "qualifiedName": "router", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4176": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4177": { + "qualifiedName": "stop", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4178": { + "qualifiedName": "get_request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4179": { + "qualifiedName": "get_dataset", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4180": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4181": { + "qualifiedName": "error_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4182": { + "qualifiedName": "failed_request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4183": { + "qualifiedName": 
"on_skipped_request", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4184": { + "qualifiedName": "run", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4185": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4186": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4187": { + "qualifiedName": "get_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4188": { + "qualifiedName": "export_data", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4189": { + "qualifiedName": "request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4190": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4191": { + "qualifiedName": "configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4192": { + "qualifiedName": "event_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4193": { + "qualifiedName": "storage_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4194": { + "qualifiedName": "request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4195": { + "qualifiedName": "session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4196": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4197": { + "qualifiedName": "http_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4198": { + "qualifiedName": "max_request_retries", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4199": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": 
"/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4200": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4201": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4202": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4203": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4204": { + "qualifiedName": "concurrency_settings", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4205": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4206": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4207": { + "qualifiedName": "configure_logging", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4208": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4209": { + "qualifiedName": "keep_alive", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4210": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4211": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4212": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4213": { + "qualifiedName": "status_message_logging_interval", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4214": { + "qualifiedName": "status_message_callback", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + 
}, + "4215": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4216": { + "qualifiedName": "request_handler", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4217": { + "qualifiedName": "statistics", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4218": { + "qualifiedName": "configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4219": { + "qualifiedName": "event_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4220": { + "qualifiedName": "storage_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4221": { + "qualifiedName": "request_manager", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4222": { + "qualifiedName": "session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4223": { + "qualifiedName": "proxy_configuration", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4224": { + "qualifiedName": "http_client", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4225": { + "qualifiedName": "max_request_retries", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4226": { + "qualifiedName": "max_requests_per_crawl", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4227": { + "qualifiedName": "max_session_rotations", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4228": { + "qualifiedName": "max_crawl_depth", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4229": { + "qualifiedName": "use_session_pool", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4230": { + "qualifiedName": "retry_on_blocked", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4231": { + 
"qualifiedName": "concurrency_settings", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4232": { + "qualifiedName": "request_handler_timeout", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4233": { + "qualifiedName": "abort_on_error", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4234": { + "qualifiedName": "configure_logging", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4235": { + "qualifiedName": "statistics_log_format", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4236": { + "qualifiedName": "keep_alive", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4237": { + "qualifiedName": "additional_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4238": { + "qualifiedName": "ignore_http_error_status_codes", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4239": { + "qualifiedName": "respect_robots_txt_file", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4240": { + "qualifiedName": "status_message_logging_interval", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4241": { + "qualifiedName": "status_message_callback", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4242": { + "qualifiedName": "id", + "sourceFileName": "/src/crawlee/crawlers/_basic/_basic_crawler.py" + }, + "4243": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4244": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4245": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4246": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4247": { + "qualifiedName": "send_request", + "sourceFileName": 
"/src/crawlee/_types.py" + }, + "4248": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4249": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4250": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4251": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4252": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4253": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4254": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4255": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4256": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4257": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4258": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4259": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4260": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4261": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4262": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4263": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4264": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4265": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4266": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4267": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4268": { + "qualifiedName": "session", + 
"sourceFileName": "/src/crawlee/_types.py" + }, + "4269": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4270": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4271": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4272": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4273": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4274": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4275": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4276": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4277": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4278": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4279": { + "qualifiedName": "block_requests", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4280": { + "qualifiedName": "goto_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4281": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4282": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4283": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4284": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4285": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4286": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4287": { + 
"qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4288": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4289": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4290": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4291": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4292": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4293": { + "qualifiedName": "response", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py" + }, + "4294": { + "qualifiedName": "page", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4295": { + "qualifiedName": "block_requests", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4296": { + "qualifiedName": "goto_options", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4297": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py" + }, + "4298": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4299": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4300": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4301": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4302": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4303": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4304": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4305": { + "qualifiedName": 
"get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4306": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4307": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4308": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4309": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4310": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4311": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4312": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4313": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4314": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4315": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4316": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4317": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4318": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4319": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4320": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4321": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4322": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4323": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4324": { + "qualifiedName": 
"get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4325": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4326": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4327": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4328": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4329": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4330": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4331": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4332": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4333": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4334": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4335": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4336": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4337": { + "qualifiedName": "parsed_content", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4338": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4339": { + "qualifiedName": "extract_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4340": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4341": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4342": { + 
"qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4343": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4344": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4345": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4346": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4347": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4348": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4349": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4350": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4351": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4352": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4353": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4354": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4355": { + "qualifiedName": "parsed_content", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4356": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4357": { + "qualifiedName": "extract_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4358": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4359": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + 
}, + "4360": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4361": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4362": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4363": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4364": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4365": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4366": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4367": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4368": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4369": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4370": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4371": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4372": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4373": { + "qualifiedName": "parsed_content", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4374": { + "qualifiedName": "enqueue_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4375": { + "qualifiedName": "extract_links", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4376": { + "qualifiedName": "from_http_crawling_context", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4377": { + "qualifiedName": "from_basic_crawling_context", + "sourceFileName": 
"/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4378": { + "qualifiedName": "get_snapshot", + "sourceFileName": "/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py" + }, + "4379": { + "qualifiedName": "http_response", + "sourceFileName": "/src/crawlee/http_clients/_base.py" + }, + "4380": { + "qualifiedName": "request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4381": { + "qualifiedName": "session", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4382": { + "qualifiedName": "proxy_info", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4383": { + "qualifiedName": "send_request", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4384": { + "qualifiedName": "add_requests", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4385": { + "qualifiedName": "push_data", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4386": { + "qualifiedName": "use_state", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4387": { + "qualifiedName": "get_key_value_store", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4388": { + "qualifiedName": "log", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4389": { + "qualifiedName": "__hash__", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4390": { + "qualifiedName": "create_modified_copy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4391": { + "qualifiedName": "current_size", + "sourceFileName": "/src/crawlee/_utils/system.py" + }, + "4392": { + "qualifiedName": "limit", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4393": { + "qualifiedName": "base_url", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4394": { + "qualifiedName": "strategy", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4395": { + "qualifiedName": "include", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4396": { + "qualifiedName": "exclude", + "sourceFileName": "/src/crawlee/_types.py" + }, + "4397": { + "qualifiedName": "model_config", + "sourceFileName": 
"/src/crawlee/_request.py" + }, + "4398": { + "qualifiedName": "unique_key", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4399": { + "qualifiedName": "url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4400": { + "qualifiedName": "method", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4401": { + "qualifiedName": "payload", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4402": { + "qualifiedName": "retry_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4403": { + "qualifiedName": "no_retry", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4404": { + "qualifiedName": "loaded_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4405": { + "qualifiedName": "handled_at", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4406": { + "qualifiedName": "from_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4407": { + "qualifiedName": "get_query_param_from_url", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4408": { + "qualifiedName": "label", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4409": { + "qualifiedName": "session_id", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4410": { + "qualifiedName": "crawlee_data", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4411": { + "qualifiedName": "crawl_depth", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4412": { + "qualifiedName": "state", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4413": { + "qualifiedName": "max_retries", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4414": { + "qualifiedName": "session_rotation_count", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4415": { + "qualifiedName": "enqueue_strategy", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4416": { + "qualifiedName": "last_proxy_tier", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4417": { + "qualifiedName": "forefront", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4418": { + 
"qualifiedName": "was_already_handled", + "sourceFileName": "/src/crawlee/_request.py" + }, + "4419": { + "qualifiedName": "__init__", + "sourceFileName": "/src/crawlee/errors.py" + } + }, + "overloads": [ + { + "modifiers": [ + "async" + ], + "args": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 102 + }, + "name": "path", + "type": "POSITIONAL", + "datatype": "Path" + }, + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 103 + }, + "name": "data", + "type": "POSITIONAL", + "datatype": "str" + }, + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 105 + }, + "name": "retry_count", + "type": "KEYWORD_ONLY", + "datatype": "int", + "default_value": "0" + } + ], + "return_type": "None", + "decorations": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 100 + }, + "name": "overload" + } + ], + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 101 + }, + "name": "atomic_write", + "type": "function", + "parsedDocstring": { + "text": "" + } + }, + { + "modifiers": [ + "async" + ], + "args": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 111 + }, + "name": "path", + "type": "POSITIONAL", + "datatype": "Path" + }, + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 112 + }, + "name": "data", + "type": "POSITIONAL", + "datatype": "bytes" + }, + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 114 + }, + "name": "retry_count", + "type": "KEYWORD_ONLY", + "datatype": "int", + "default_value": "0" + } + ], + "return_type": "None", + "decorations": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 109 + }, + "name": "overload" + } + ], + "location": { + 
"filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/_utils/file.py", + "lineno": 110 + }, + "name": "atomic_write", + "type": "function", + "parsedDocstring": { + "text": "" + } + }, + { + "modifiers": [ + "async" + ], + "args": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 9 + }, + "name": "response", + "type": "POSITIONAL", + "datatype": "Awaitable[T]" + } + ], + "return_type": "T", + "decorations": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 8 + }, + "name": "overload" + } + ], + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 9 + }, + "name": "await_redis_response", + "type": "function", + "parsedDocstring": { + "text": "" + } + }, + { + "modifiers": [ + "async" + ], + "args": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 11 + }, + "name": "response", + "type": "POSITIONAL", + "datatype": "T" + } + ], + "return_type": "T", + "decorations": [ + { + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 10 + }, + "name": "overload" + } + ], + "location": { + "filename": "REPO_ROOT_PLACEHOLDER/src/crawlee/storage_clients/_redis/_utils.py", + "lineno": 11 + }, + "name": "await_redis_response", + "type": "function", + "parsedDocstring": { + "text": "" + } + } + ] +} \ No newline at end of file diff --git a/website/versioned_docs/version-1.6/changelog.md b/website/versioned_docs/version-1.6/changelog.md new file mode 100644 index 0000000000..c6ea47314e --- /dev/null +++ b/website/versioned_docs/version-1.6/changelog.md @@ -0,0 +1,822 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +<!-- git-cliff-unreleased-start --> +## 1.6.1 - **not yet released** + +### ๐Ÿ› Bug Fixes + +- Handle invalid URLs in `RequestList` ([#1803](https://github.com/apify/crawlee-python/pull/1803)) ([0b2e3fc](https://github.com/apify/crawlee-python/commit/0b2e3fc5cbca371131b54085e052a6cda6361b0f)) by [@Mantisus](https://github.com/Mantisus), closes [#1802](https://github.com/apify/crawlee-python/issues/1802) + + +<!-- git-cliff-unreleased-end --> +## [1.6.0](https://github.com/apify/crawlee-python/releases/tag/v1.6.0) (2026-03-20) + +### ๐Ÿš€ Features + +- Allow non-href links extract & enqueue ([#1781](https://github.com/apify/crawlee-python/pull/1781)) ([6db365d](https://github.com/apify/crawlee-python/commit/6db365d1625206d8d691256c9cd4b44a821238bb)) by [@kozlice](https://github.com/kozlice) +- Add `post_navigation_hooks` to crawlers ([#1795](https://github.com/apify/crawlee-python/pull/1795)) ([38ceda6](https://github.com/apify/crawlee-python/commit/38ceda635a18cb2f14efc7c8e8b67f3adb7e53fd)) by [@Mantisus](https://github.com/Mantisus) +- Add page lifecycle hooks to `BrowserPool` ([#1791](https://github.com/apify/crawlee-python/pull/1791)) ([6f2ac13](https://github.com/apify/crawlee-python/commit/6f2ac13fea4cfa8a65e6e41430d3e8d28cc3a787)) by [@Mantisus](https://github.com/Mantisus) +- Expose `BrowserType` and `CrawleePage` ([#1798](https://github.com/apify/crawlee-python/pull/1798)) ([b50b9f2](https://github.com/apify/crawlee-python/commit/b50b9f2a8396dcee2bd7eaf76c94d24912c2bc5f)) by [@Mantisus](https://github.com/Mantisus) +- Expose `use_state` in `BasicCrawler` ([#1799](https://github.com/apify/crawlee-python/pull/1799)) ([d121873](https://github.com/apify/crawlee-python/commit/d121873a7f5902b911dd04b4aa9eaf75a8449323)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- **redis:** Do not remove handled request data from request queue ([#1787](https://github.com/apify/crawlee-python/pull/1787)) 
([3008c61](https://github.com/apify/crawlee-python/commit/3008c61dcbe07ccdf3c43f198b37582cc1356c9a)) by [@kozlice](https://github.com/kozlice) +- **redis:** Update actual `Request` state in request queue Redis storage client ([#1789](https://github.com/apify/crawlee-python/pull/1789)) ([787231c](https://github.com/apify/crawlee-python/commit/787231cebeb863ee2b4395964a79a37053dbec01)) by [@Mantisus](https://github.com/Mantisus) + + +## [1.5.0](https://github.com/apify/crawlee-python/releases/tag/v1.5.0) (2026-03-06) + +### ๐Ÿš€ Features + +- Use specialized Playwright docker images in templates ([#1757](https://github.com/apify/crawlee-python/pull/1757)) ([747c0cf](https://github.com/apify/crawlee-python/commit/747c0cf4a82296a2e3ea5cac5ef4c9578ea62a0c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1756](https://github.com/apify/crawlee-python/issues/1756) +- Add `discover_valid_sitemaps` utility ([#1777](https://github.com/apify/crawlee-python/pull/1777)) ([872447b](https://github.com/apify/crawlee-python/commit/872447b60bbdb3926068064a971492807b1bdfbb)) by [@Mantisus](https://github.com/Mantisus), closes [#1740](https://github.com/apify/crawlee-python/issues/1740) + +### ๐Ÿ› Bug Fixes + +- Prevent list modification during iteration in BrowserPool ([#1703](https://github.com/apify/crawlee-python/pull/1703)) ([70309d9](https://github.com/apify/crawlee-python/commit/70309d9bf568d268a26b3ba6392be2b6ff284c65)) by [@vdusek](https://github.com/vdusek) +- Fix ` max_requests_per_crawl` excluding failed requests ([#1766](https://github.com/apify/crawlee-python/pull/1766)) ([d6bb0b4](https://github.com/apify/crawlee-python/commit/d6bb0b4a9dc5dd6668d076fbfa1b5e748deaee0d)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1765](https://github.com/apify/crawlee-python/issues/1765) +- **playwright:** Dispose of `APIResponse` body for `send_request` ([#1771](https://github.com/apify/crawlee-python/pull/1771)) 
([29d301b](https://github.com/apify/crawlee-python/commit/29d301bf9d7795f2fbaddb99235a7157b880f60c)) by [@kozlice](https://github.com/kozlice) +- Return `None` from `add_request` when storage client fails to enqueue request ([#1775](https://github.com/apify/crawlee-python/pull/1775)) ([944753a](https://github.com/apify/crawlee-python/commit/944753a71956c30f3ce0896ffa24be7de5348933)) by [@Mantisus](https://github.com/Mantisus) +- Re-use pre-existing browser context in `PlaywrightBrowserController` ([#1778](https://github.com/apify/crawlee-python/pull/1778)) ([4487543](https://github.com/apify/crawlee-python/commit/44875433df83d433aa69ada458b91df3ad569f5e)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1776](https://github.com/apify/crawlee-python/issues/1776) + + +## [1.4.0](https://github.com/apify/crawlee-python/releases/tag/v1.4.0) (2026-02-17) + +### ๐Ÿš€ Features + +- Dynamic memory snapshots ([#1715](https://github.com/apify/crawlee-python/pull/1715)) ([568a7b1](https://github.com/apify/crawlee-python/commit/568a7b186dedda19ad814ee8af3cd8e256cc4ad9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1704](https://github.com/apify/crawlee-python/issues/1704) +- Add `MySQL` and `MariaDB` support for `SqlStorageClient` ([#1749](https://github.com/apify/crawlee-python/pull/1749)) ([202b500](https://github.com/apify/crawlee-python/commit/202b5009ea5d35ea779eb5b8db1fc575f90ca7bb)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Make log levels consistent in ServiceLocator ([#1746](https://github.com/apify/crawlee-python/pull/1746)) ([4163413](https://github.com/apify/crawlee-python/commit/4163413049485b035c38efd6a4a7d41502a44cfc)) by [@janbuchar](https://github.com/janbuchar) +- Fix `PlaywrightCrawler` unintentionally setting the global configuration ([#1747](https://github.com/apify/crawlee-python/pull/1747)) ([fa58438](https://github.com/apify/crawlee-python/commit/fa58438026eb72a6002c8d494725bf4e48b4407e)) by 
[@Pijukatel](https://github.com/Pijukatel) +- Fix `Snapshotter` handling of out of order samples ([#1735](https://github.com/apify/crawlee-python/pull/1735)) ([387c712](https://github.com/apify/crawlee-python/commit/387c712306055d901b1c0df4a9666967f039aefd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1734](https://github.com/apify/crawlee-python/issues/1734) + +### โšก Performance + +- Optimize metadata records processing in `SqlStorageClient` ([#1551](https://github.com/apify/crawlee-python/pull/1551)) ([df1347a](https://github.com/apify/crawlee-python/commit/df1347aacf05c05980000d15b36b65996119ea86)) by [@Mantisus](https://github.com/Mantisus), closes [#1533](https://github.com/apify/crawlee-python/issues/1533) + + +## [1.3.2](https://github.com/apify/crawlee-python/releases/tag/v1.3.2) (2026-02-09) + +### ๐Ÿ› Bug Fixes + +- Use `max()` instead of `min()` for `request_max_duration` statistic ([#1701](https://github.com/apify/crawlee-python/pull/1701)) ([85c4335](https://github.com/apify/crawlee-python/commit/85c43351a05ada1369b720061f6f1a7e158340b6)) by [@vdusek](https://github.com/vdusek) +- Prevent mutation of default URL patterns list in `block_requests` ([#1702](https://github.com/apify/crawlee-python/pull/1702)) ([fcf9adb](https://github.com/apify/crawlee-python/commit/fcf9adb6a0cfeaa87ca482372d4e066584eb28d6)) by [@vdusek](https://github.com/vdusek) +- Keep None values for `user_data` in `Request` ([#1707](https://github.com/apify/crawlee-python/pull/1707)) ([3c575bc](https://github.com/apify/crawlee-python/commit/3c575bc2b0f1c89c99d134ad3a3fa7455ccc6910)) by [@Mantisus](https://github.com/Mantisus), closes [#1706](https://github.com/apify/crawlee-python/issues/1706) +- Respect `max_open_pages_per_browser` limit for `PlaywrightBrowserController` on concurrent `new_page` calls ([#1712](https://github.com/apify/crawlee-python/pull/1712)) ([2e5534b](https://github.com/apify/crawlee-python/commit/2e5534b98913d5cbd6b721b2423d063772024417)) by 
[@Mantisus](https://github.com/Mantisus) + + +## [1.3.1](https://github.com/apify/crawlee-python/releases/tag/v1.3.1) (2026-01-30) + +### ๐Ÿ› Bug Fixes + +- Reset all counter in metadata with `purge` for `RequestQueue` ([#1686](https://github.com/apify/crawlee-python/pull/1686)) ([ee09260](https://github.com/apify/crawlee-python/commit/ee0926084589f1b6e15840b6185ec5433be3b72f)) by [@Mantisus](https://github.com/Mantisus), closes [#1682](https://github.com/apify/crawlee-python/issues/1682) +- Set default `http3=False` for `ImpitHttpClient` ([#1685](https://github.com/apify/crawlee-python/pull/1685)) ([3f390f6](https://github.com/apify/crawlee-python/commit/3f390f677540a3905038d7db6a6d1efad32fd045)) by [@Mantisus](https://github.com/Mantisus), closes [#1683](https://github.com/apify/crawlee-python/issues/1683) +- Prevent get_request from permanently blocking requests ([#1684](https://github.com/apify/crawlee-python/pull/1684)) ([da416f9](https://github.com/apify/crawlee-python/commit/da416f98fb453904d62e7d29d8f24611ffb3ba8d)) by [@Mirza-Samad-Ahmed-Baig](https://github.com/Mirza-Samad-Ahmed-Baig) +- Do not share state between different crawlers unless requested ([#1669](https://github.com/apify/crawlee-python/pull/1669)) ([64c246b](https://github.com/apify/crawlee-python/commit/64c246bedea14f86e607d23adc5bec644c578364)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1627](https://github.com/apify/crawlee-python/issues/1627) + + +## [1.3.0](https://github.com/apify/crawlee-python/releases/tag/v1.3.0) (2026-01-20) + +### ๐Ÿš€ Features + +- Expose `AdaptivePlaywrightCrawlerStatisticState` for `AdaptivePlaywrightCrawler` ([#1635](https://github.com/apify/crawlee-python/pull/1635)) ([1bb4bcb](https://github.com/apify/crawlee-python/commit/1bb4bcb4ccbec347ad9c14f70e9e946d48e3c38e)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Prevent race condition in concurrent storage creation 
([#1626](https://github.com/apify/crawlee-python/pull/1626)) ([7f17a43](https://github.com/apify/crawlee-python/commit/7f17a4347d5884962767e757a92ec173688fed7b)) by [@Mantisus](https://github.com/Mantisus), closes [#1621](https://github.com/apify/crawlee-python/issues/1621) +- Create correct statistics for `AdaptivePlaywrightCrawler` on initialization with a custom parser ([#1637](https://github.com/apify/crawlee-python/pull/1637)) ([bff7260](https://github.com/apify/crawlee-python/commit/bff726055dd0d7e07a2c546b15cbee22abd85960)) by [@Mantisus](https://github.com/Mantisus), closes [#1630](https://github.com/apify/crawlee-python/issues/1630) +- Fix adding extra link for `EnqueueLinksFunction` with `limit` ([#1674](https://github.com/apify/crawlee-python/pull/1674)) ([71d7867](https://github.com/apify/crawlee-python/commit/71d7867b14f7f07cac06899f5da006091af4a954)) by [@Mantisus](https://github.com/Mantisus), closes [#1673](https://github.com/apify/crawlee-python/issues/1673) + + +## [1.2.1](https://github.com/apify/crawlee-python/releases/tag/v1.2.1) (2025-12-16) + +### ๐Ÿ› Bug Fixes + +- Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602) +- Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus) +- Respect `enqueue_strategy` after redirects in `enqueue_links` ([#1607](https://github.com/apify/crawlee-python/pull/1607)) ([700df91](https://github.com/apify/crawlee-python/commit/700df91bc9be1299388030a3e48e4dbc6f5b85a0)) by [@Mantisus](https://github.com/Mantisus), closes [#1606](https://github.com/apify/crawlee-python/issues/1606) +- Protect 
`Request` from partial mutations on request handler failure ([#1585](https://github.com/apify/crawlee-python/pull/1585)) ([a69caf8](https://github.com/apify/crawlee-python/commit/a69caf87edecc755287c53c8cc0ca4725af5d411)) by [@Mantisus](https://github.com/Mantisus), closes [#1514](https://github.com/apify/crawlee-python/issues/1514) + + + +## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08) + +### ๐Ÿš€ Features + +- Add additional kwargs to Crawler's export_data ([#1597](https://github.com/apify/crawlee-python/pull/1597)) ([5977f37](https://github.com/apify/crawlee-python/commit/5977f376b93a7c0d4dd53f0d331a4b04fedba2c6)) by [@vdusek](https://github.com/vdusek), closes [#526](https://github.com/apify/crawlee-python/issues/526) +- Add `goto_options` for `PlaywrightCrawler` ([#1599](https://github.com/apify/crawlee-python/pull/1599)) ([0b82f3b](https://github.com/apify/crawlee-python/commit/0b82f3b6fb175223ea2aa5b348afcd5fdb767972)) by [@Mantisus](https://github.com/Mantisus), closes [#1576](https://github.com/apify/crawlee-python/issues/1576) + +### ๐Ÿ› Bug Fixes + +- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar) +- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus) +- Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus) + + +## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02) 
+ +### ๐Ÿ› Bug Fixes + +- Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512) +- Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571) +- Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532) +- Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579) +- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589) + + +## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18) + +### ๐Ÿš€ Features + +- Add `chrome` `BrowserType` for `PlaywrightCrawler` to use the Chrome browser ([#1487](https://github.com/apify/crawlee-python/pull/1487)) 
([b06937b](https://github.com/apify/crawlee-python/commit/b06937bbc3afe3c936b554bfc503365c1b2c526b)) by [@Mantisus](https://github.com/Mantisus), closes [#1071](https://github.com/apify/crawlee-python/issues/1071) +- Add `RedisStorageClient` based on Redis v8.0+ ([#1406](https://github.com/apify/crawlee-python/pull/1406)) ([d08d13d](https://github.com/apify/crawlee-python/commit/d08d13d39203c24ab61fe254b0956d6744db3b5f)) by [@Mantisus](https://github.com/Mantisus) +- Add support for Python 3.14 ([#1553](https://github.com/apify/crawlee-python/pull/1553)) ([89e9130](https://github.com/apify/crawlee-python/commit/89e9130cabee0fbc974b29c26483b7fa0edf627c)) by [@Mantisus](https://github.com/Mantisus) +- Add `transform_request_function` parameter for `SitemapRequestLoader` ([#1525](https://github.com/apify/crawlee-python/pull/1525)) ([dc90127](https://github.com/apify/crawlee-python/commit/dc901271849b239ba2a947e8ebff8e1815e8c4fb)) by [@Mantisus](https://github.com/Mantisus) + +### 🐛 Bug Fixes + +- Improve indexing of the `request_queue_records` table for `SqlRequestQueueClient` ([#1527](https://github.com/apify/crawlee-python/pull/1527)) ([6509534](https://github.com/apify/crawlee-python/commit/65095346a9d8b703b10c91e0510154c3c48a4176)) by [@Mantisus](https://github.com/Mantisus), closes [#1526](https://github.com/apify/crawlee-python/issues/1526) +- Improve error handling for `RobotsTxtFile.load` ([#1524](https://github.com/apify/crawlee-python/pull/1524)) ([596a311](https://github.com/apify/crawlee-python/commit/596a31184914a254b3e7a81fd2f48ea8eda7db49)) by [@Mantisus](https://github.com/Mantisus) +- Fix `crawler_runtime` not being updated during run and only in the end ([#1540](https://github.com/apify/crawlee-python/pull/1540)) ([0d6c3f6](https://github.com/apify/crawlee-python/commit/0d6c3f6d3337ddb6cab4873747c28cf95605d550)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1541](https://github.com/apify/crawlee-python/issues/1541) +- Ensure persist 
state event emission when exiting `EventManager` context ([#1562](https://github.com/apify/crawlee-python/pull/1562)) ([6a44f17](https://github.com/apify/crawlee-python/commit/6a44f172600cbcacebab899082d6efc9105c4e03)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1560](https://github.com/apify/crawlee-python/issues/1560) + + +## [1.0.4](https://github.com/apify/crawlee-python/releases/tag/v1.0.4) (2025-10-24) + +### 🐛 Bug Fixes + +- Respect `enqueue_strategy` in `enqueue_links` ([#1505](https://github.com/apify/crawlee-python/pull/1505)) ([6ee04bc](https://github.com/apify/crawlee-python/commit/6ee04bc08c50a70f2e956a79d4ce5072a726c3a8)) by [@Mantisus](https://github.com/Mantisus), closes [#1504](https://github.com/apify/crawlee-python/issues/1504) +- Exclude incorrect links before checking `robots.txt` ([#1502](https://github.com/apify/crawlee-python/pull/1502)) ([3273da5](https://github.com/apify/crawlee-python/commit/3273da5fee62ec9254666b376f382474c3532a56)) by [@Mantisus](https://github.com/Mantisus), closes [#1499](https://github.com/apify/crawlee-python/issues/1499) +- Resolve compatibility issue between `SqlStorageClient` and `AdaptivePlaywrightCrawler` ([#1496](https://github.com/apify/crawlee-python/pull/1496)) ([ce172c4](https://github.com/apify/crawlee-python/commit/ce172c425a8643a1d4c919db4f5e5a6e47e91deb)) by [@Mantisus](https://github.com/Mantisus), closes [#1495](https://github.com/apify/crawlee-python/issues/1495) +- Fix `BasicCrawler` statistics persistence ([#1490](https://github.com/apify/crawlee-python/pull/1490)) ([1eb1c19](https://github.com/apify/crawlee-python/commit/1eb1c19aa6f9dda4a0e3f7eda23f77a554f95076)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1501](https://github.com/apify/crawlee-python/issues/1501) +- Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler` ([#1488](https://github.com/apify/crawlee-python/pull/1488)) 
([62b7c70](https://github.com/apify/crawlee-python/commit/62b7c70b54085fc65a660062028014f4502beba9)) by [@Mantisus](https://github.com/Mantisus), closes [#1483](https://github.com/apify/crawlee-python/issues/1483) + + +## [1.0.3](https://github.com/apify/crawlee-python/releases/tag/v1.0.3) (2025-10-17) + +### 🐛 Bug Fixes + +- Add support for Pydantic v2.12 ([#1471](https://github.com/apify/crawlee-python/pull/1471)) ([35c1108](https://github.com/apify/crawlee-python/commit/35c110878c2f445a2866be2522ea8703e9b371dd)) by [@Mantisus](https://github.com/Mantisus), closes [#1464](https://github.com/apify/crawlee-python/issues/1464) +- Fix database version warning message ([#1485](https://github.com/apify/crawlee-python/pull/1485)) ([18a545e](https://github.com/apify/crawlee-python/commit/18a545ee8add92e844acd0068f9cb8580a82e1c9)) by [@Mantisus](https://github.com/Mantisus) +- Fix `reclaim_request` in `SqlRequestQueueClient` to correctly update the request state ([#1486](https://github.com/apify/crawlee-python/pull/1486)) ([1502469](https://github.com/apify/crawlee-python/commit/150246957f8f7f1ceb77bb77e3a02a903c50cae1)) by [@Mantisus](https://github.com/Mantisus), closes [#1484](https://github.com/apify/crawlee-python/issues/1484) +- Fix `KeyValueStore.auto_saved_value` failing in some scenarios ([#1438](https://github.com/apify/crawlee-python/pull/1438)) ([b35dee7](https://github.com/apify/crawlee-python/commit/b35dee78180e57161b826641d45a61b8d8f6ef51)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1354](https://github.com/apify/crawlee-python/issues/1354) + + +## [1.0.2](https://github.com/apify/crawlee-python/releases/tag/v1.0.2) (2025-10-08) + +### 🐛 Bug Fixes + +- Use Self type in the open() method of storage clients ([#1462](https://github.com/apify/crawlee-python/pull/1462)) ([4ec6f6c](https://github.com/apify/crawlee-python/commit/4ec6f6c08f81632197f602ff99151338b3eba6e7)) by [@janbuchar](https://github.com/janbuchar) +- Add storages name 
validation ([#1457](https://github.com/apify/crawlee-python/pull/1457)) ([84de11a](https://github.com/apify/crawlee-python/commit/84de11a3a603503076f5b7df487c9abab68a9015)) by [@Mantisus](https://github.com/Mantisus), closes [#1434](https://github.com/apify/crawlee-python/issues/1434) +- Pin pydantic version to <2.12.0 to avoid compatibility issues ([#1467](https://github.com/apify/crawlee-python/pull/1467)) ([f11b86f](https://github.com/apify/crawlee-python/commit/f11b86f7ed57f98e83dc1b52f15f2017a919bf59)) by [@vdusek](https://github.com/vdusek) + + +## [1.0.1](https://github.com/apify/crawlee-python/releases/tag/v1.0.1) (2025-10-06) + +### 🐛 Bug Fixes + +- Fix memory leak in `PlaywrightCrawler` on browser context creation ([#1446](https://github.com/apify/crawlee-python/pull/1446)) ([bb181e5](https://github.com/apify/crawlee-python/commit/bb181e58d8070fba38e62d6e57fe981a00e5f035)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1443](https://github.com/apify/crawlee-python/issues/1443) +- Update templates to handle optional httpx client ([#1440](https://github.com/apify/crawlee-python/pull/1440)) ([c087efd](https://github.com/apify/crawlee-python/commit/c087efd39baedf46ca3e5cae1ddc1acd6396e6c1)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29) + +- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v1) for more details. +- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v1) to ensure a smooth update. 
+ +### 🚀 Features + +- Add utility for load and parse Sitemap and `SitemapRequestLoader` ([#1169](https://github.com/apify/crawlee-python/pull/1169)) ([66599f8](https://github.com/apify/crawlee-python/commit/66599f8d085f3a8622e130019b6fdce2325737de)) by [@Mantisus](https://github.com/Mantisus), closes [#1161](https://github.com/apify/crawlee-python/issues/1161) +- Add periodic status logging and `status_message_callback` parameter for customization ([#1265](https://github.com/apify/crawlee-python/pull/1265)) ([b992fb2](https://github.com/apify/crawlee-python/commit/b992fb2a457dedd20fc3014d7a4a8afe14602342)) by [@Mantisus](https://github.com/Mantisus), closes [#96](https://github.com/apify/crawlee-python/issues/96) +- Add crawlee-cli option to skip project installation ([#1294](https://github.com/apify/crawlee-python/pull/1294)) ([4d5aef0](https://github.com/apify/crawlee-python/commit/4d5aef05613d10c1442fe449d1cf0f63392c98e3)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1122](https://github.com/apify/crawlee-python/issues/1122) +- Improve `Crawlee` CLI help text ([#1297](https://github.com/apify/crawlee-python/pull/1297)) ([afbe10f](https://github.com/apify/crawlee-python/commit/afbe10f15d93353f5bc551bf9f193414179d0dd7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1295](https://github.com/apify/crawlee-python/issues/1295) +- Add basic `OpenTelemetry` instrumentation ([#1255](https://github.com/apify/crawlee-python/pull/1255)) ([a92d8b3](https://github.com/apify/crawlee-python/commit/a92d8b3f843ee795bba7e14710bb1faa1fdbf292)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1254](https://github.com/apify/crawlee-python/issues/1254) +- Add `ImpitHttpClient` http-client client using the `impit` library ([#1151](https://github.com/apify/crawlee-python/pull/1151)) ([0d0d268](https://github.com/apify/crawlee-python/commit/0d0d2681a4379c0e7ba54c49c86dabfef641610f)) by [@Mantisus](https://github.com/Mantisus) +- Prevent overloading system 
memory when running locally ([#1270](https://github.com/apify/crawlee-python/pull/1270)) ([30de3bd](https://github.com/apify/crawlee-python/commit/30de3bd7722cbc34db9fc582b4bda7dc2dfa90ff)) by [@janbuchar](https://github.com/janbuchar), closes [#1232](https://github.com/apify/crawlee-python/issues/1232) +- Expose `PlaywrightPersistentBrowser` class ([#1314](https://github.com/apify/crawlee-python/pull/1314)) ([b5fa955](https://github.com/apify/crawlee-python/commit/b5fa95508d7c099ff3a342577f338439283a975f)) by [@Mantisus](https://github.com/Mantisus) +- Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) ([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus) +- Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99) +- Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272) +- Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269) +- Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes 
[#1175](https://github.com/apify/crawlee-python/issues/1175) +- Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402) +- Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by [@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307) + +### 🐛 Bug Fixes + +- Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([ab020eb](https://github.com/apify/crawlee-python/commit/ab020eb821a75723225b652d64babd84c368183f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329) +- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([74fa1d9](https://github.com/apify/crawlee-python/commit/74fa1d936cb3c29cf62d87862a96b4266694af2f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326) +- [**breaking**] Remove unused "stats" field from RequestQueueMetadata ([#1331](https://github.com/apify/crawlee-python/pull/1331)) ([0a63bef](https://github.com/apify/crawlee-python/commit/0a63bef514b0bdcd3d6f208b386f706d0fe561e6)) by [@vdusek](https://github.com/vdusek) +- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([50d3ef7](https://github.com/apify/crawlee-python/commit/50d3ef7540551383d26d40f3404b435bde35b47d)) by [@Mantisus](https://github.com/Mantisus), closes 
[#1333](https://github.com/apify/crawlee-python/issues/1333) +- Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus) +- Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318) +- Improve crawler statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317) +- Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383) +- Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410) +- Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek) + +### 🚜 Refactor + +- [**breaking**] Introduce new 
storage client system ([#1194](https://github.com/apify/crawlee-python/pull/1194)) ([de1c03f](https://github.com/apify/crawlee-python/commit/de1c03f70dbd4ae1773fd49c632b3cfcfab82c26)) by [@vdusek](https://github.com/vdusek), closes [#92](https://github.com/apify/crawlee-python/issues/92), [#147](https://github.com/apify/crawlee-python/issues/147), [#783](https://github.com/apify/crawlee-python/issues/783), [#1247](https://github.com/apify/crawlee-python/issues/1247) +- [**breaking**] Split `BrowserType` literal into two different literals based on context ([#1070](https://github.com/apify/crawlee-python/pull/1070)) ([72b5698](https://github.com/apify/crawlee-python/commit/72b5698fa0647ea02b08da5651736cc37c4c0f6a)) by [@Pijukatel](https://github.com/Pijukatel) +- [**breaking**] Change method `HttpResponse.read` from sync to async ([#1296](https://github.com/apify/crawlee-python/pull/1296)) ([83fa8a4](https://github.com/apify/crawlee-python/commit/83fa8a416b6d2d4e27c678b9bf99bd1b8799f57b)) by [@Mantisus](https://github.com/Mantisus) +- [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079) +- [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes 
[#1358](https://github.com/apify/crawlee-python/issues/1358) +- [**breaking**] Refactor storage creation and caching, configuration and services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379) + + + +## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30) + +### 🚀 Features + +- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus) + +### 🐛 Bug Fixes + +- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256) +- Fix memory estimation not working on MacOS (#1330) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329) +- Fix retry count to not count the original request (#1328) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326) +- Ignore unknown parameters passed in cookies (#1336) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333) + + +## 
[0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23) + +### 🚀 Features + +- Add `stream` method for `HttpClient` ([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus) + +### 🐛 Bug Fixes + +- Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1207](https://github.com/apify/crawlee-python/issues/1207) +- Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206) +- Do not raise an error to check 'same-domain' if there is no hostname in the url ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02) + +### 🐛 Bug Fixes + +- Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185) +- Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) 
([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus) +- Fix match check for specified enqueue strategy for requests with redirect ([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198) +- Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) ([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204) +- Add support non-GET requests for `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201) +- Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212) + + +## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02) + +### 🚀 Features + +- Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), 
closes [#928](https://github.com/apify/crawlee-python/issues/928) +- Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158) +- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160) + +### 🐛 Bug Fixes + +- Fix handle error without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179) +- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25) + +### 🚀 Features + +- Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456) +- Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) 
([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus) + +### 🐛 Bug Fixes + +- Update `UnprocessedRequest` to match actual data ([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150) +- Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) ([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus) +- Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17) + +### 🚀 Features + +- Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151) + +### 🐛 Bug Fixes + +- Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138) +- Disable logger propagation to prevent duplicate logs 
([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek) + + +## [0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03) + +### 🚀 Features + +- Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700) +- Add Session binding capability via `session_id` in `Request` ([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076) +- Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel) + +### 🐛 Bug Fixes + +- Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus) +- Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029) +- Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) 
([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva) +- **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) ([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by [@baldasseva](https://github.com/baldasseva) + + +## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13) + +### 🐛 Bug Fixes + +- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12) + +### 🐛 Bug Fixes + +- Add a check thread before set `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus) +- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07) + +### 🚀 Features + +- Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053) +- Use fingerprint generator in `PlaywrightCrawler` by default ([#1060](https://github.com/apify/crawlee-python/pull/1060)) 
([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054) + +### 🐛 Bug Fixes + +- Update project templates for Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049)) ([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954) +- Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05) + +### 🚀 Features + +- Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03) + +### 🐛 Bug Fixes + +- Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03) + +- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v06) for more details. +- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v06) to ensure a smooth update. 
+ +### ๐Ÿš€ Features + +- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549) +- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel) +- Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60) +- Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249) +- [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953) +- Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937) + +### ๐Ÿ› Bug Fixes + +- Fix playwright template and dockerfile 
([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969) +- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975) +- Fix default migration storage ([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991) +- Fix logger name for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021) +- Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016) +- Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013) +- Fix flaky test_common_headers_and_user_agent 
([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027) + +### ๐Ÿšœ Refactor + +- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) ([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963) +- [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933) +- [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Update status code handling
([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998) +- [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703), [#1010](https://github.com/apify/crawlee-python/issues/1010) + + +## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05) + +### ๐Ÿš€ Features + +- Add support `use_incognito_pages` for `browser_launch_options` in `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus) + +### ๐Ÿ› Bug Fixes + +- Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus) +- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951) +- Fix convert relative link to absolute in `enqueue_links` for response with redirect ([#956](https://github.com/apify/crawlee-python/pull/956)) 
([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955) +- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31) + +### ๐Ÿš€ Features + +- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891) +- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848) +- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank) +- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894) +- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) 
([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda) +- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel) + +### ๐Ÿ› Bug Fixes + +- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907) +- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895) +- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus) + + +## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17) + +### ๐Ÿ› Bug Fixes + +- Avoid `use_state` race conditions. 
Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856) +- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887) +- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670) + + +## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07) + +### ๐Ÿ› Bug Fixes + +- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02) + +- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v05) for more details. +- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) to ensure a smooth update. 
+ +### ๐Ÿš€ Features + +- Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687) +- Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191) +- Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635) +- Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684) +- Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes [#561](https://github.com/apify/crawlee-python/issues/561) +- Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) ([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651) +- 
Add `html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659) +- [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿ› Bug Fixes + +- Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804) +- [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699) +- Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) ([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798) +- Fix `page_options` for `PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by 
[@Mantisus](https://github.com/Mantisus), closes [#755](https://github.com/apify/crawlee-python/issues/755) +- Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815) +- Respect user configuration for work with status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756) +- `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus) +- Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806) +- Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿšœ Refactor + +- [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) ([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350) +- 
[**breaking**] Remove `json_` and `order_no` from `Request` ([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94) +- [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel) +- [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764) + + +## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06) + +### ๐Ÿš€ Features + +- Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), [#511](https://github.com/apify/crawlee-python/issues/511) + +### ๐Ÿ› Bug Fixes + +- Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) 
([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek) +- Fix incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759) +- Remove pydantic constraint <2.10.0 and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29) + +### ๐Ÿš€ Features + +- Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719) +- Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704) + +### ๐Ÿ› Bug Fixes + +- Fix init of context managers and context handling in `BasicCrawler` ([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek) + + +## [0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21) + +### ๐Ÿ› Bug Fixes + +- Pydantic 2.10.0 issues 
([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20) + +### ๐Ÿ› Bug Fixes + +- Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus) +- Fix serialization payload in Request. Fix Docs for Post Request ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668) +- Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek) +- Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11) + +### ๐Ÿš€ Features + +- Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) ([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460) +- Add BeautifulSoupParser type alias ([#674](https://github.com/apify/crawlee-python/pull/674)) 
([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel) + +### ๐Ÿ› Bug Fixes + +- Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) ([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar) +- Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663) +- Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar) +- Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel) + + +## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01) + +- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v04) to ensure a smooth update. 
+ +### ๐Ÿš€ Features + +- [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#548](https://github.com/apify/crawlee-python/issues/548) +- Add `pre_navigation_hooks` to `PlaywrightCrawler` ([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427) +- Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547) +- Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526) + +### ๐Ÿ› Bug Fixes + +- Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek) +- [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560) +- Default ProxyInfo port if httpx.URL port is None 
([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618) + +### โš™๏ธ Miscellaneous Tasks + +- [**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) ([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615) + + +## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23) + +### ๐Ÿš€ Features + +- Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar) +- Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514) + +### ๐Ÿ› Bug Fixes + +- Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563) + + +## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02) + +### ๐Ÿš€ Features + +- Mask Playwright's "headless" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by 
[@vdusek](https://github.com/vdusek), closes [#401](https://github.com/apify/crawlee-python/issues/401) +- Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek) + +### ๐Ÿ› Bug Fixes + +- Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) ([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546) +- Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551) +- Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25) + +### ๐Ÿ› Bug Fixes + +- Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524) +- Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek) + + +## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) 
(2024-09-19) + +### ๐Ÿš€ Features + +- Add HTTP/2 support for HTTPX client ([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512) +- Expose extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) ([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek) +- Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402) + +### ๐Ÿ› Bug Fixes + +- Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532) + + +## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10) + +### ๐Ÿš€ Features + +- Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿ› Bug Fixes + +- Http clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496) +- Correctly 
handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) ([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05) + +### ๐Ÿ› Bug Fixes + +- Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) ([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek) + + +## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05) + +### ๐Ÿ› Bug Fixes + +- Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02) + +### ๐Ÿ› Bug Fixes + +- Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442) +- Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar) +- JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488) + + +## 
[0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30) + +### ๐Ÿš€ Features + +- Curl http client selects chrome impersonation by default ([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek) + + +## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27) + +- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v03) to ensure a smooth update. + +### ๐Ÿš€ Features + +- Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335) +- Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305) + +### ๐Ÿ› Bug Fixes + +- Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306) +- Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar) +- Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) 
([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar) +- Relative URLS supports & If not a URL, pass #417 ([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417) +- Typo in ProlongRequestLockResponse ([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar) +- Add missing __all__ to top-level __init__.py file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar) + +### ๐Ÿšœ Refactor + +- [**breaking**] RequestQueue and service management rehaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423) +- [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek) + + +## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05) + +### ๐Ÿ› Bug Fixes + +- Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) 
([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek) + + +## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05) + +### ๐Ÿš€ Features + +- Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), closes [#292](https://github.com/apify/crawlee-python/issues/292) +- **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar) + + +## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30) + +### ๐Ÿš€ Features + +- Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300) + +### ๐Ÿ› Bug Fixes + +- Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540) +- Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296) +- Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) 
([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337) +- Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) ([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334) +- Avoid Github rate limiting in project bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar) +- Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar) +- Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152) + + +## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19) + +### ๐Ÿš€ Features + +- Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303) +- Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) 
by [@vdusek](https://github.com/vdusek) +- Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239) + +### ๐Ÿ› Bug Fixes + +- Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by [@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283) +- Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302) +- Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar) +- Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301) + + +## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08) + +### ๐Ÿš€ Features + +- Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215) + +### ๐Ÿ› Bug Fixes + +- CLI UX 
improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267) +- Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) ([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes [#268](https://github.com/apify/crawlee-python/issues/268) + + +## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27) + +### ๐Ÿ› Bug Fixes + +- Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek) +- Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230) +- Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus) +- Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236) + + +## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25) + +### ๐Ÿš€ Features + +- Maintain a global configuration 
instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar) +- Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by [@vdusek](https://github.com/vdusek) +- Add support decompress *br* response content ([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus) +- BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211) +- Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214) + +### ๐Ÿ› Bug Fixes + +- Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204) +- Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes 
[#212](https://github.com/apify/crawlee-python/issues/212) + + +## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21) + +### ๐Ÿš€ Features + +- Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes [#131](https://github.com/apify/crawlee-python/issues/131) +- Add emit persist state event to event manager ([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek) +- Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek) +- Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172) +- Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84) +- Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek) + +### ๐Ÿ› Bug Fixes + +- 
Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159) +- Add explicit err msgs for missing pckg extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) ([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by [@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155) +- Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar) +- **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot]) +- Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar) +- Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar) +- Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194) + + +## 
[0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30) + +### ๐Ÿš€ Features + +- Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97) +- Proxy configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) ([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136) +- Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek) + + +## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13) + +### ๐Ÿš€ Features + +- AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19) +- Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek) +- Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30) +- BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) 
([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31) +- Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by [@janbuchar](https://github.com/janbuchar), closes [#5](https://github.com/apify/crawlee-python/issues/5) +- Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110) +- Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek) + +### ๐Ÿ› Bug Fixes + +- **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot]) +- **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot]) +- Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek) +- Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) 
([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek) + + +<!-- generated by git-cliff --> \ No newline at end of file diff --git a/website/versioned_docs/version-1.6/deployment/apify_platform.mdx b/website/versioned_docs/version-1.6/deployment/apify_platform.mdx new file mode 100644 index 0000000000..fc09127ea9 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/apify_platform.mdx @@ -0,0 +1,253 @@ +--- +id: apify-platform +title: Apify platform +description: Apify platform - large-scale and high-performance web scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py'; +import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py'; +import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py'; +import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py'; + +Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api). + +While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure. + +:::note + +We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry PI. We strive to make it work, but there are no guarantees. 
+ +::: + +## Requirements + +To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up). + +Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation). + +Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`: + +```bash +pip install apify +``` + +## Logging into Apify platform from Crawlee + +To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables. + +Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on. + +### Log in with CLI + +Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added. + +```bash +npm install -g apify-cli +apify login -t YOUR_API_TOKEN +``` + +### Log in with environment variables + +Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token. + +> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password) +> environment variable. Actor automatically infers that from your token, but it can be useful +> when you need to access proxies from a different account than your token represents. 
+ +### Log in with Configuration + +Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there. + +<CodeBlock className="language-python"> + {LogWithConfigExample} +</CodeBlock> + +## What is an Actor + +When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset. + +Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours. + +**Related links** + +- [Store of existing Actors](https://apify.com/store) +- [Documentation](https://docs.apify.com/actors) +- [View Actors in Apify Console](https://console.apify.com/actors) +- [API reference](https://apify.com/docs/api/v2#/reference/actors) + +## Running an Actor locally + +First let's create a boilerplate of the new Actor. You could use Apify CLI and just run: + +```bash +apify create my-hello-world +``` + +The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows: + +```bash +cd my-hello-world +apify run +``` + +## Running Crawlee code as an Actor + +For running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`. + +:::info NOTE +Adding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. 
It is needed to initialize your Actor (e.g. to set the correct storage implementation) and to correctly handle exiting the process. +::: + +Let's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide: + +<CodeBlock className="language-python"> + {CrawlerAsActorExample} +</CodeBlock> + +Note that you could also run your Actor (that is using Crawlee) locally with Apify CLI. You could start it via the following command in your project folder: + +```bash +apify run +``` + +## Deploying an Actor to Apify platform + +Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running: + +```bash +apify push +``` + +Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the +[Apify Actor](https://docs.apify.com/cli) documentation. + +## Usage on Apify platform + +You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section. + +## Storages + +There are several things worth mentioning here. 
+
+### Helper functions for default Key-Value Store and Dataset
+
+To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use:
+- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)
+- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)
+
+### Using platform storage in a local Actor
+
+When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage.
+
+Using each of these methods allows you to pass the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk.
+
+:::note
+If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants <ApiLink to="class/KeyValueStore#open">`KeyValueStore.open()`</ApiLink>, <ApiLink to="class/Dataset#open">`Dataset.open()`</ApiLink> and <ApiLink to="class/RequestQueue#open">`RequestQueue.open()`</ApiLink> will work the same.
+::: + +{/* +### Getting public url of an item in the platform storage + +If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share. + +<CodeBlock language="python"> + {GetPublicUrlSource} +</CodeBlock> + +*/} + +### Exporting dataset data + +When the <ApiLink to="class/Dataset">`Dataset`</ApiLink> is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results. + +**Related links** + +- [Apify platform storage documentation](https://docs.apify.com/storage) +- [View storage in Apify Console](https://console.apify.com/storage) +- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores) +- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets) +- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues) + +## Environment variables + +The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation. + +:::note + +It's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. 
Likewise, Crawlee understands `APIFY_` environment variables. + +::: + +### `APIFY_TOKEN` + +The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage +or to run an Actor on the Apify platform. You can find your API token on the +[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page. + +### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR` + +By combining the env vars in various ways, you can greatly influence the Actor's behavior. + +| Env Vars | API | Storages | +| --------------------------------------- | --- | ---------------- | +| none OR `CRAWLEE_STORAGE_DIR` | no | local | +| `APIFY_TOKEN` | yes | Apify platform | +| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform | + +When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform +features and your data will be stored locally by default. If you want to access platform storages, +you can use the `force_cloud=true` option in their respective functions. + +### `APIFY_PROXY_PASSWORD` + +Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation. +Assuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy) +in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var, +so in most cases, you don't need to touch it. You should use it when, for some reason, +you need access to Apify Proxy, but not access to Apify API, or when you need access to +proxy from a different account than your token represents. + +## Proxy management + +In addition to your own proxy servers and proxy servers acquired from +third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy) +for your scraping needs. 
+
+### Apify proxy
+
+If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage you should first be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).
+
+<CodeBlock className="language-python">
+  {ProxyExample}
+</CodeBlock>
+
+Note that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead.
+
+### Advanced Apify proxy configuration
+
+With Apify Proxy, you can select specific proxy groups to use, or countries to connect from.
+This allows you to get better proxy performance after some initial research.
+
+<CodeBlock className="language-python">
+  {ProxyAdvancedExample}
+</CodeBlock>
+
+Now your crawlers will use only Residential proxies from the US. Note that you must first get access
+to a proxy group before you are able to use it. You can check proxy groups available to you
+in the [proxy dashboard](https://console.apify.com/proxy).
+
+### Apify proxy vs. own proxies
+
+The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.
+
+The difference is easy to remember.
+- If you're using your own proxies - you should create a <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instance directly. +- If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy. + +**Related links** + +- [Apify Proxy docs](https://docs.apify.com/proxy) diff --git a/website/versioned_docs/version-1.6/deployment/aws_lambda.mdx b/website/versioned_docs/version-1.6/deployment/aws_lambda.mdx new file mode 100644 index 0000000000..cbe49c9ff0 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/aws_lambda.mdx @@ -0,0 +1,190 @@ +--- +id: aws-lambda +title: Deploy on AWS Lambda +description: Prepare your crawler to run on AWS Lambda. +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import BeautifulSoupCrawlerLambda from '!!raw-loader!./code_examples/aws/beautifulsoup_crawler_lambda.py'; +import PlaywrightCrawlerLambda from '!!raw-loader!./code_examples/aws/playwright_crawler_lambda.py'; +import PlaywrightCrawlerDockerfile from '!!raw-loader!./code_examples/aws/playwright_dockerfile'; + +[AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) is a serverless compute service that lets you run code without provisioning or managing servers. This guide covers deploying <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. + +The code examples are based on the [BeautifulSoupCrawler example](../examples/beautifulsoup-crawler). + +## BeautifulSoupCrawler on AWS Lambda + +For simple crawlers that don't require browser rendering, you can deploy using a ZIP archive. 
+ +### Updating the code + +When instantiating a crawler, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink>. By default, Crawlee uses file-based storage, but the Lambda filesystem is read-only (except for `/tmp`). Using `MemoryStorageClient` tells Crawlee to use in-memory storage instead. + +Wrap the crawler logic in a `lambda_handler` function. This is the entry point that AWS will execute. + +:::important + +Make sure to always instantiate a new crawler for every Lambda invocation. AWS keeps the environment running for some time after the first execution (to reduce cold-start times), so subsequent calls may access an already-used crawler instance. + +**TL;DR: Keep your Lambda stateless.** + +::: + +Finally, return the scraped data from the Lambda when the crawler run ends. + +<CodeBlock language="python" title="lambda_function.py"> + {BeautifulSoupCrawlerLambda} +</CodeBlock> + +### Preparing the environment + +Lambda requires all dependencies to be included in the deployment package. Create a virtual environment and install dependencies: + +```bash +python3.14 -m venv .venv +source .venv/bin/activate +pip install 'crawlee[beautifulsoup]' 'boto3' 'aws-lambda-powertools' +``` + +[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Including it in your dependencies is recommended to avoid version misalignment issues with the Lambda runtime. + +### Creating the ZIP archive + +Create a ZIP archive from your project, including dependencies from the virtual environment: + +```bash +cd .venv/lib/python3.14/site-packages +zip -r ../../../../package.zip . +cd ../../../../ +zip package.zip lambda_function.py +``` + +:::note Large dependencies? + +AWS has a limit of 50 MB for direct upload and 250 MB for unzipped deployment package size. + +A better way to manage dependencies is by using Lambda Layers. 
With Layers, you can share files between multiple Lambda functions and keep the actual code as slim as possible. + +To create a Lambda Layer: + +1. Create a `python/` folder and copy dependencies from `site-packages` into it +2. Create a zip archive: `zip -r layer.zip python/` +3. Create a new Lambda Layer from the archive (you may need to upload it to S3 first) +4. Attach the Layer to your Lambda function + +::: + +### Creating the Lambda function + +Create the Lambda function in the AWS Lambda Console: + +1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/). +2. Click **Create function**. +3. Select **Author from scratch**. +4. Enter a **Function name**, for example `BeautifulSoupTest`. +5. Choose a **Python runtime** that matches the version used in your virtual environment (for example, Python 3.14). +6. Click **Create function** to finish. + +Once created, upload `package.zip` as the code source in the AWS Lambda Console using the "Upload from" button. + +In Lambda Runtime Settings, set the handler. Since the file is named `lambda_function.py` and the function is `lambda_handler`, you can use the default value `lambda_function.lambda_handler`. + +:::tip Configuration + +In the Configuration tab, you can adjust: + +- **Memory**: Memory size can greatly affect execution speed. A minimum of 256-512 MB is recommended. +- **Timeout**: Set according to the size of the website you are scraping (1 minute for the example code). +- **Ephemeral storage**: Size of the `/tmp` directory. + +See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory. + +::: + +After the Lambda deploys, you can test it by clicking the "Test" button. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler. 
+ +## PlaywrightCrawler on AWS Lambda + +For crawlers that require browser rendering, you need to deploy using Docker container images because Playwright and browser binaries exceed Lambda's ZIP deployment size limits. + +### Updating the code + +As with <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> and wrap the logic in a `lambda_handler` function. Additionally, configure `browser_launch_options` with flags optimized for serverless environments. These flags disable sandboxing and GPU features that aren't available in Lambda's containerized runtime. + +<CodeBlock language="python" title="main.py"> + {PlaywrightCrawlerLambda} +</CodeBlock> + +### Installing and configuring AWS CLI + +Install AWS CLI following the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) according to your operating system. + +Authenticate by running: + +```bash +aws login +``` + +### Preparing the project + +Initialize the project by running `uvx 'crawlee[cli]' create`. + +Or use a single command if you don't need interactive mode: + +```bash +uvx 'crawlee[cli]' create aws_playwright --crawler-type playwright --http-client impit --package-manager uv --no-apify --start-url 'https://crawlee.dev' --install +``` + +Add the following dependencies: + +```bash +uv add awslambdaric aws-lambda-powertools boto3 +``` + +[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Use it if your function integrates with any other AWS services. 
+ +The project is created with a Dockerfile that needs to be modified for AWS Lambda by adding `ENTRYPOINT` and updating `CMD`: + +<CodeBlock language="dockerfile" title="Dockerfile"> + {PlaywrightCrawlerDockerfile} +</CodeBlock> + +### Building and pushing the Docker image + +Create a repository `lambda/aws-playwright` in [Amazon Elastic Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) in the same region where your Lambda functions will run. To learn more, refer to the [official documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html). + +Navigate to the created repository and click the "View push commands" button. This will open a window with console commands for uploading the Docker image to your repository. Execute them. + +Example: +```bash +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data} +docker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright . +docker tag lambda/aws-playwright:latest {user-specific-data}/lambda/aws-playwright:latest +docker push {user-specific-data}/lambda/aws-playwright:latest +``` + +### Creating the Lambda function + +1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/). +2. Click **Create function**. +3. Select **Container image**. +4. Browse and select your ECR image. +5. Click **Create function** to finish. + +:::tip Configuration + +In the Configuration tab, you can adjust resources. Playwright crawlers require more resources than BeautifulSoup crawlers: + +- **Memory**: Minimum 1024 MB recommended. Browser operations are memory-intensive, so 2048 MB or more may be needed for complex pages. +- **Timeout**: Set according to crawl size. Browser startup adds overhead, so allow at least 5 minutes even for simple crawls. +- **Ephemeral storage**: Default 512 MB is usually sufficient unless downloading large files. 
+ +See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory. + +::: + +After the Lambda deploys, click the "Test" button to invoke it. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler. diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/apify/crawler_as_actor_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/apify/crawler_as_actor_example.py new file mode 100644 index 0000000000..53527d555b --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/apify/crawler_as_actor_example.py @@ -0,0 +1,27 @@ +import asyncio + +from apify import Actor + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Wrap the crawler code in an Actor context manager. 
+ async with Actor: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/apify/get_public_url.py b/website/versioned_docs/version-1.6/deployment/code_examples/apify/get_public_url.py new file mode 100644 index 0000000000..d12cfba300 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/apify/get_public_url.py @@ -0,0 +1,16 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + store = await Actor.open_key_value_store() + await store.set_value('your-file', {'foo': 'bar'}) + url = store.get_public_url('your-file') + Actor.log.info(f'KVS public URL: {url}') + # https://api.apify.com/v2/key-value-stores/<your-store-id>/records/your-file + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/apify/log_with_config_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/apify/log_with_config_example.py new file mode 100644 index 0000000000..dfefa7b5ae --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/apify/log_with_config_example.py @@ -0,0 +1,19 @@ +import asyncio + +from apify import Actor, Configuration + + +async def main() -> None: + # Create a new configuration with your API key. You can find it at + # https://console.apify.com/settings/integrations. It can be provided either + # as a parameter "token" or as an environment variable "APIFY_TOKEN". 
+ config = Configuration( + token='apify_api_YOUR_TOKEN', + ) + + async with Actor(config): + Actor.log.info('Hello from Apify platform!') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_advanced_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_advanced_example.py new file mode 100644 index 0000000000..1b5306bd39 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_advanced_example.py @@ -0,0 +1,20 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + password='apify_proxy_YOUR_PASSWORD', + # Specify the proxy group to use. + groups=['RESIDENTIAL'], + # Set the country code for the proxy. + country_code='US', + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_example.py new file mode 100644 index 0000000000..d546c5cc45 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/apify/proxy_example.py @@ -0,0 +1,24 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + async with Actor: + # Create a new Apify Proxy configuration. The password can be found at + # https://console.apify.com/proxy/http-settings and should be provided either + # as a parameter "password" or as an environment variable "APIFY_PROXY_PASSWORD". 
+ proxy_configuration = await Actor.create_proxy_configuration( + password='apify_proxy_YOUR_PASSWORD', + ) + + if not proxy_configuration: + Actor.log.warning('Failed to create proxy configuration.') + return + + proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Proxy URL: {proxy_url}') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py b/website/versioned_docs/version-1.6/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py new file mode 100644 index 0000000000..3fb8bfe3b1 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py @@ -0,0 +1,61 @@ +import asyncio +import json +from datetime import timedelta +from typing import Any + +from aws_lambda_powertools.utilities.typing import LambdaContext + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset, RequestQueue + + +async def main() -> str: + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + + # Initialize storages + dataset = await Dataset.open(storage_client=storage_client) + request_queue = await RequestQueue.open(storage_client=storage_client) + + crawler = BeautifulSoupCrawler( + storage_client=storage_client, + max_request_retries=1, + request_handler_timeout=timedelta(seconds=30), + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 
'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + # Extract data saved in `Dataset` + data = await crawler.get_data() + + # Clean up storages after the crawl + await dataset.drop() + await request_queue.drop() + + # Serialize the list of scraped items to JSON string + return json.dumps(data.items) + + +def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]: + result = asyncio.run(main()) + # Return the response with results + return {'statusCode': 200, 'body': result} diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_crawler_lambda.py b/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_crawler_lambda.py new file mode 100644 index 0000000000..d1c831ef51 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_crawler_lambda.py @@ -0,0 +1,73 @@ +import asyncio +import json +from datetime import timedelta +from typing import Any + +from aws_lambda_powertools.utilities.typing import LambdaContext + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset, RequestQueue + + +async def main() -> str: + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + + # Initialize storages + dataset = await Dataset.open(storage_client=storage_client) + request_queue = await RequestQueue.open(storage_client=storage_client) + + crawler = PlaywrightCrawler( + storage_client=storage_client, + max_request_retries=1, + request_handler_timeout=timedelta(seconds=30), + max_requests_per_crawl=10, + # highlight-start + # Configure Playwright to run in AWS Lambda environment + browser_launch_options={ + 'args': [ + '--no-sandbox', + 
'--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--single-process', + ] + }, + # highlight-end + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'h1s': await context.page.locator('h1').all_text_contents(), + 'h2s': await context.page.locator('h2').all_text_contents(), + 'h3s': await context.page.locator('h3').all_text_contents(), + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + # Extract data saved in `Dataset` + data = await crawler.get_data() + + # Clean up storages after the crawl + await dataset.drop() + await request_queue.drop() + + # Serialize the list of scraped items to JSON string + return json.dumps(data.items) + + +def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]: + result = asyncio.run(main()) + # Return the response with results + return {'statusCode': 200, 'body': result} diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_dockerfile b/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_dockerfile new file mode 100644 index 0000000000..618587e55f --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/aws/playwright_dockerfile @@ -0,0 +1,36 @@ +FROM apify/actor-python-playwright:3.14 + +RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/* + +RUN pip install -U pip setuptools \ + && pip install 'uv<1' + +ENV UV_PROJECT_ENVIRONMENT="/usr/local" + +COPY pyproject.toml uv.lock ./ + +RUN echo "Python version:" \ + && python --version \ + && echo "Installing dependencies:" \ + && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \ + && if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \ 
+ echo "Playwright already installed, excluding from uv sync" \ + && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \ + else \ + echo "Playwright not found, installing all dependencies" \ + && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \ + fi \ + && echo "All installed Python packages:" \ + && pip freeze + +COPY . ./ + +RUN python -m compileall -q . + +# highlight-start +# AWS Lambda entrypoint +ENTRYPOINT [ "/usr/local/bin/python3", "-m", "awslambdaric" ] + +# Lambda handler function +CMD [ "aws_playwright.main.lambda_handler" ] +# highlight-end diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/google/cloud_run_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/google/cloud_run_example.py new file mode 100644 index 0000000000..27d23b99eb --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/google/cloud_run_example.py @@ -0,0 +1,53 @@ +import json +import os + +import uvicorn +from litestar import Litestar, get + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storage_clients import MemoryStorageClient + + +@get('/') +async def main() -> str: + """The crawler entry point that will be called when the HTTP endpoint is accessed.""" + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + + crawler = PlaywrightCrawler( + headless=True, + max_requests_per_crawl=10, + browser_type='firefox', + storage_client=storage_client, + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + """Default request handler that processes each page during crawling.""" + context.log.info(f'Processing {context.request.url} ...') + title = await context.page.query_selector('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': 
await title.inner_text() if title else None, + } + ) + + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + data = await crawler.get_data() + + # Return the results as JSON to the client + return json.dumps(data.items) + + +# Initialize the Litestar app with our route handler +app = Litestar(route_handlers=[main]) + +# Start the Uvicorn server using the `PORT` environment variable provided by GCP +# This is crucial - Cloud Run expects your app to listen on this specific port +uvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080'))) # noqa: S104 # Use all interfaces in a container, safely diff --git a/website/versioned_docs/version-1.6/deployment/code_examples/google/google_example.py b/website/versioned_docs/version-1.6/deployment/code_examples/google/google_example.py new file mode 100644 index 0000000000..68deac804c --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/code_examples/google/google_example.py @@ -0,0 +1,57 @@ +import asyncio +import json +from datetime import timedelta + +import functions_framework +from flask import Request, Response + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> str: + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + + crawler = BeautifulSoupCrawler( + storage_client=storage_client, + max_request_retries=1, + request_handler_timeout=timedelta(seconds=30), + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in 
context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev']) + + # highlight-start + # Extract data saved in `Dataset` + data = await crawler.get_data() + # Serialize to json string and return + return json.dumps(data.items) + # highlight-end + + +@functions_framework.http +def crawlee_run(request: Request) -> Response: + # You can pass data to your crawler using `request` + function_id = request.headers['Function-Execution-Id'] + response_str = asyncio.run(main()) + + # Return a response with the crawling results + return Response(response=response_str, status=200) diff --git a/website/versioned_docs/version-1.6/deployment/google_cloud.mdx b/website/versioned_docs/version-1.6/deployment/google_cloud.mdx new file mode 100644 index 0000000000..e4f1fbe480 --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/google_cloud.mdx @@ -0,0 +1,45 @@ +--- +id: gcp-cloud-run-functions +title: Cloud Run functions +description: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform. +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py'; + +[Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests. + +## Updating the project + +For the project foundation, use <ApiLink to="class/BeautifulSoupCrawler">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler). + +Add [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. 
If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`. + +Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project. + +<CodeBlock className="language-python"> + {GoogleFunctions.replace(/^.*?\n/, '')} +</CodeBlock> + +You can test your project locally. Start the server by running: + +```bash +functions-framework --target=crawlee_run +``` + +Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser. + +## Deploying to Google Cloud Platform + +In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout. + +When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard. + +Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies. + +Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`. + +After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block. diff --git a/website/versioned_docs/version-1.6/deployment/google_cloud_run.mdx b/website/versioned_docs/version-1.6/deployment/google_cloud_run.mdx new file mode 100644 index 0000000000..c9aef10c3d --- /dev/null +++ b/website/versioned_docs/version-1.6/deployment/google_cloud_run.mdx @@ -0,0 +1,51 @@ +--- +id: gcp-cloud-run +title: Cloud Run +description: Prepare your crawler to run in Cloud Run on Google Cloud Platform. 
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+
+import CodeBlock from '@theme/CodeBlock';
+
+import GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py';
+
+
+[Google Cloud Run](https://cloud.google.com/run) is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions).
+
+GCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/).
+
+## Preparing the project
+
+We'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves.
+
+:::info
+
+GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world).
+
+:::
+
+<CodeBlock className="language-python">
+    {GoogleCloudRun.replace(/^.*?\n/, '')}
+</CodeBlock>
+
+
+:::tip
+
+Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.**
+
+:::
+
+## Deploying to Google Cloud Platform
+
+Now, we're ready to deploy! If you have initialized your project using `uvx crawlee create`, the initialization script has prepared a Dockerfile for you.
+
+All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it).
The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private. + +After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there. + +:::tip + +In case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping. + +::: diff --git a/website/versioned_docs/version-1.6/examples/add_data_to_dataset.mdx b/website/versioned_docs/version-1.6/examples/add_data_to_dataset.mdx new file mode 100644 index 0000000000..aa4164cacf --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/add_data_to_dataset.mdx @@ -0,0 +1,40 @@ +--- +id: add-data-to-dataset +title: Add data to dataset +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_pw.py'; +import DatasetExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_dataset.py'; + +This example demonstrates how to store extracted data into datasets using the <ApiLink to="class/PushDataFunction#open">`context.push_data`</ApiLink> helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the <ApiLink to="class/PushDataFunction#open">`push_data`</ApiLink> function. 
+ +<Tabs groupId="main"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> + +Each item in the dataset will be stored in its own file within the following directory: + +```text +{PROJECT_FOLDER}/storage/datasets/default/ +``` + +For more control, you can also open a dataset manually using the asynchronous constructor <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> + +<RunnableCodeBlock className="language-python" language="python"> + {DatasetExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/beautifulsoup_crawler.mdx b/website/versioned_docs/version-1.6/examples/beautifulsoup_crawler.mdx new file mode 100644 index 0000000000..160e4c4d65 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/beautifulsoup_crawler.mdx @@ -0,0 +1,15 @@ +--- +id: beautifulsoup-crawler +title: BeautifulSoup crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py'; + +This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. 
Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. + +<RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/capture_screenshot_using_playwright.mdx b/website/versioned_docs/version-1.6/examples/capture_screenshot_using_playwright.mdx new file mode 100644 index 0000000000..614693b1e8 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/capture_screenshot_using_playwright.mdx @@ -0,0 +1,19 @@ +--- +id: capture-screenshots-using-playwright +title: Capture screenshots using Playwright +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py'; + +This example demonstrates how to capture screenshots of web pages using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and store them in the key-value store. + +The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method. + +The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page. 
+ +<RunnableCodeBlock className="language-python" language="python"> + {CaptureScreenshotExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx b/website/versioned_docs/version-1.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx new file mode 100644 index 0000000000..87ff540298 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/capturing_page_snapshots_with_error_snapshotter.mdx @@ -0,0 +1,27 @@ +--- +id: capturing-page-snapshots-with-error-snapshotter +title: Capturing page snapshots with ErrorSnapshotter +description: How to capture page snapshots on errors. +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import ApiLink from '@site/src/components/ApiLink'; +import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py'; +import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py'; + + +This example demonstrates how to capture page snapshots on first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to="class/Statistics">`Statistics`</ApiLink>. The error snapshot can contain `html` file and `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the html file, but only <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is able to capture page screenshot as well. 
+ +<Tabs> + <TabItem value="ParselCrawler" label="ParselCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + { ParselCrawlerWithErrorSnapshotter } + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + { PlaywrightCrawlerWithErrorSnapshotter } + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/code_examples/adaptive_playwright_crawler.py b/website/versioned_docs/version-1.6/examples/code_examples/adaptive_playwright_crawler.py new file mode 100644 index 0000000000..904a000379 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/adaptive_playwright_crawler.py @@ -0,0 +1,66 @@ +import asyncio +from datetime import timedelta + +from playwright.async_api import Route + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + # Crawler created by following factory method will use `beautifulsoup` + # for parsing static content. + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=10, # Limit the max requests per crawl. + playwright_crawler_specific_kwargs={'headless': False}, + ) + + @crawler.router.default_handler + async def request_handler_for_label( + context: AdaptivePlaywrightCrawlingContext, + ) -> None: + # Do some processing using `parsed_content` + context.log.info(context.parsed_content.title) + + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + # Find more links and enqueue them. + await context.enqueue_links() + # Save some data. 
+ await context.push_data({'Visited url': context.request.url}) + + @crawler.pre_navigation_hook + async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler. + + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright.""" + context.log.info(f'pre navigation hook for: {context.request.url} ...') + + @crawler.pre_navigation_hook(playwright_only=True) + async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler. + + It is safe to access `page` object. + """ + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info( + f'Playwright only pre navigation hook for: {context.request.url} ...' + ) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_bs.py b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_bs.py new file mode 100644 index 0000000000..4318cbe0d4 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_bs.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'html': str(context.soup)[:1000], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of requests. + await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_dataset.py b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_dataset.py new file mode 100644 index 0000000000..b1d9aba923 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_dataset.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Open dataset manually using asynchronous constructor open(). + dataset = await Dataset.open() + + # Interact with dataset directly. + await dataset.push_data({'key': 'value'}) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_pw.py b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_pw.py new file mode 100644 index 0000000000..8eb714aef3 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/add_data_to_dataset_pw.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'html': str(await context.page.content())[:1000], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of requests. + await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler.py b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler.py new file mode 100644 index 0000000000..5e9701d7cb --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler.py @@ -0,0 +1,57 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import ( + BasicCrawlingContext, + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically + # loads the URLs and parses their HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler( + # On error, retry each page at most once. + max_request_retries=1, + # Increase the timeout for processing each page to 30 seconds. + request_handler_timeout=timedelta(seconds=30), + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - soup: the BeautifulSoup object containing the parsed HTML of the response. 
+ @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Register pre navigation hook which will be called before each request. + # This hook is optional and does not need to be defined at all. + @crawler.pre_navigation_hook + async def some_hook(context: BasicCrawlingContext) -> None: + pass + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py new file mode 100644 index 0000000000..38e5623939 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_keep_alive.py @@ -0,0 +1,56 @@ +import asyncio + +from crawlee._types import BasicCrawlingContext +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Keep the crawler alive even when there are no requests to be processed now. + keep_alive=True, + ) + + def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None: + """Stop crawler once specific url is visited. 
+
+        Example of guard condition to stop the crawler."""
+        if context.request.url == 'https://crawlee.dev/docs/examples':
+            crawler.stop(
+                'Stop crawler that was in keep_alive state after specific url was visited'
+            )
+        else:
+            context.log.info('keep_alive=True, waiting for more requests to come.')
+
+    async def add_request_later(url: str, after_s: int) -> None:
+        """Add requests to the queue after some time. Can be done by external code."""
+        # Just an example of request being added to the crawler later,
+        # when it is waiting due to `keep_alive=True`.
+        await asyncio.sleep(after_s)
+        await crawler.add_requests([url])
+
+    # Define the default request handler, which will be called for every request.
+    @crawler.router.default_handler
+    async def request_handler(context: BasicCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+        # Stop crawler if some guard condition has been met.
+        stop_crawler_if_url_visited(context)
+
+    # Start some tasks that will add some requests later to simulate real situation,
+    # where requests are added later by external code.
+    add_request_later_task1 = asyncio.create_task(
+        add_request_later(url='https://crawlee.dev', after_s=1)
+    )
+    add_request_later_task2 = asyncio.create_task(
+        add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
+    )
+
+    # Run the crawler without the initial list of requests.
+    # Wait for more requests to be added to the queue later due to `keep_alive=True`.
+ await crawler.run() + + await asyncio.gather(add_request_later_task1, add_request_later_task2) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_stop.py b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_stop.py new file mode 100644 index 0000000000..2069bd6ecb --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/beautifulsoup_crawler_stop.py @@ -0,0 +1,41 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically + # loads the URLs and parses their HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - soup: the BeautifulSoup object containing the parsed HTML of the response. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Create custom condition to stop crawler once it finds what it is looking for. + if 'crawlee' in context.request.url: + crawler.stop( + reason='Manual stop of crawler after finding `crawlee` in the url.' + ) + + # Extract data from the page. + data = { + 'url': context.request.url, + } + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/capture_screenshot_using_playwright.py b/website/versioned_docs/version-1.6/examples/code_examples/capture_screenshot_using_playwright.py new file mode 100644 index 0000000000..e4b4c1ec22 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/capture_screenshot_using_playwright.py @@ -0,0 +1,47 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import KeyValueStore + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + ) + + # Open the default key-value store. + kvs = await KeyValueStore.open() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. + screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/configure_json_logging.py b/website/versioned_docs/version-1.6/examples/code_examples/configure_json_logging.py new file mode 100644 index 0000000000..25cb37c745 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/configure_json_logging.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import asyncio +import inspect +import logging +import sys +from typing import TYPE_CHECKING + +from loguru import logger + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + +if TYPE_CHECKING: + from loguru import Record + + +# Configure loguru interceptor to capture standard logging output +class InterceptHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + # Get corresponding Loguru level if it exists + try: + level: str | int = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message + frame, depth = inspect.currentframe(), 0 + while frame: + filename = frame.f_code.co_filename + is_logging = filename == logging.__file__ + is_frozen = 'importlib' in filename and '_bootstrap' in filename + if depth > 0 and not (is_logging | is_frozen): + break + frame = frame.f_back + depth += 1 + + dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) + standard_attrs = set(dummy_record.__dict__.keys()) + extra_dict = { + key: value + for key, value in record.__dict__.items() + if key not in standard_attrs + } + + ( + logger.bind(**extra_dict) + .opt(depth=depth, exception=record.exc_info) + .patch(lambda loguru_record: loguru_record.update({'name': record.name})) + .log(level, record.getMessage()) + ) + + +# Configure loguru formatter +def formatter(record: Record) -> str: + basic_format 
= '[{name}] | <level>{level: ^8}</level> | - {message}' + if record['extra']: + basic_format = basic_format + ' {extra}' + return f'{basic_format}\n' + + +# Remove default loguru logger +logger.remove() + +# Set up loguru with JSONL serialization in file `crawler.log` +logger.add('crawler.log', format=formatter, serialize=True, level='INFO') + +# Set up loguru logger for console +logger.add(sys.stderr, format=formatter, colorize=True, level='INFO') + +# Configure standard logging to use our interceptor +logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True) + + +async def main() -> None: + # Initialize crawler with disabled table logs + crawler = HttpCrawler( + configure_logging=False, # Disable default logging configuration + statistics_log_format='inline', # Set inline formatting for statistics logs + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler + await crawler.run(['https://www.crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_bs.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_bs.py new file mode 100644 index 0000000000..ad5ef62f54 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_bs.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_pw.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_pw.py new file mode 100644 index 0000000000..4a6fb6e616 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_all_links_on_website_pw.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_bs.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_bs.py new file mode 100644 index 0000000000..e8cf82f2bc --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_bs.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler with the initial list of requests. + await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_pw.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_pw.py new file mode 100644 index 0000000000..b18d04c8ad --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_multiple_urls_pw.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Run the crawler with the initial list of requests. 
+ await crawler.run( + [ + 'https://crawlee.dev', + 'https://apify.com', + 'https://example.com', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_bs.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_bs.py new file mode 100644 index 0000000000..8dfc1bdf85 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_bs.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all the documentation links found on the page, except for the examples. + await context.enqueue_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_pw.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_pw.py new file mode 100644 index 0000000000..98a2f0435b --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_specific_links_on_website_pw.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all the documentation links found on the page, except for the examples. + await context.enqueue_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_all_links.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_all_links.py new file mode 100644 index 0000000000..b253a9566f --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_all_links.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links found on the page. Any URLs found will be matched by + # this strategy, even if they go off the site you are currently crawling. + await context.enqueue_links(strategy='all') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py new file mode 100644 index 0000000000..0fa264ef20 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_domain.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same domain will enqueue all links found that + # are on the same hostname as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-domain') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py new file mode 100644 index 0000000000..0259cafe67 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_hostname.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same hostname will enqueue all links found that are on + # the same hostname (including subdomains) as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-hostname') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py new file mode 100644 index 0000000000..46e9f32759 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/crawl_website_with_relative_links_same_origin.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Setting the strategy to same origin will enqueue all links found that are on + # the same origin as request.loaded_url or request.url. + await context.enqueue_links(strategy='same-origin') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_csv.py b/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_csv.py new file mode 100644 index 0000000000..4e5369aac3 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_csv.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. 
Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the entire dataset to a CSV file. + # Use semicolon as delimiter and always quote strings. + await crawler.export_data(path='results.csv', delimiter=';', quoting='all') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_json.py b/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_json.py new file mode 100644 index 0000000000..2fc2d6f043 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/export_entire_dataset_to_file_json.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the entire dataset to a JSON file. + # Set ensure_ascii=False to allow Unicode characters in the output. + await crawler.export_data(path='results.json', ensure_ascii=False) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py b/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py new file mode 100644 index 0000000000..1fcafea1d6 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_bs.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract all the documentation links found on the page, except for the examples. + extracted_links = await context.extract_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + # Some very custom filtering which can't be achieved by `extract_links` arguments. 
+ max_link_length = 30 + filtered_links = [ + link for link in extracted_links if len(link.url) < max_link_length + ] + # Add filtered links to the request queue. + await context.add_requests(filtered_links) + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py b/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py new file mode 100644 index 0000000000..032a25f19c --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/extract_and_add_specific_links_on_website_pw.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract all the documentation links found on the page, except for the examples. + extracted_links = await context.extract_links( + include=[Glob('https://crawlee.dev/docs/**')], + exclude=[Glob('https://crawlee.dev/docs/examples')], + ) + # Some very custom filtering which can't be achieved by `extract_links` arguments. + max_link_length = 30 + filtered_links = [ + link for link in extracted_links if len(link.url) < max_link_length + ] + # Add filtered links to the request queue. + await context.add_requests(filtered_links) + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_crawler.py b/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_crawler.py new file mode 100644 index 0000000000..0545c66680 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_crawler.py @@ -0,0 +1,41 @@ +import asyncio +from urllib.parse import urlencode + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + response = (await context.http_response.read()).decode('utf-8') + context.log.info(f'Response: {response}') # To see the response in the logs. + + # Prepare a POST request to the form endpoint. + request = Request.from_url( + url='https://httpbin.org/post', + method='POST', + headers={'content-type': 'application/x-www-form-urlencoded'}, + payload=urlencode( + { + 'custname': 'John Doe', + 'custtel': '1234567890', + 'custemail': 'johndoe@example.com', + 'size': 'large', + 'topping': ['bacon', 'cheese', 'mushroom'], + 'delivery': '13:00', + 'comments': 'Please ring the doorbell upon arrival.', + } + ).encode(), + ) + + # Run the crawler with the initial list of requests. 
+ await crawler.run([request]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_request.py b/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_request.py new file mode 100644 index 0000000000..14dc6c479d --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/fill_and_submit_web_form_request.py @@ -0,0 +1,28 @@ +import asyncio +from urllib.parse import urlencode + +from crawlee import Request + + +async def main() -> None: + # Prepare a POST request to the form endpoint. + request = Request.from_url( + url='https://httpbin.org/post', + method='POST', + headers={'content-type': 'application/x-www-form-urlencoded'}, + payload=urlencode( + { + 'custname': 'John Doe', + 'custtel': '1234567890', + 'custemail': 'johndoe@example.com', + 'size': 'large', + 'topping': ['bacon', 'cheese', 'mushroom'], + 'delivery': '13:00', + 'comments': 'Please ring the doorbell upon arrival.', + } + ).encode(), + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler.py b/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler.py new file mode 100644 index 0000000000..9807d7ca3b --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler.py @@ -0,0 +1,47 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + +# Regex for identifying email addresses on a webpage. +EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' + + +async def main() -> None: + crawler = ParselCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + 'email_address_list': context.selector.re(EMAIL_REGEX), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue all links found on the page. + await context.enqueue_links() + + # Register pre navigation hook which will be called before each request. + # This hook is optional and does not need to be defined at all. + @crawler.pre_navigation_hook + async def some_hook(context: BasicCrawlingContext) -> None: + pass + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://github.com']) + + # Export the entire dataset to a JSON file. + await crawler.export_data(path='results.json') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py b/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py new file mode 100644 index 0000000000..d7c3674571 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/parsel_crawler_with_error_snapshotter.py @@ -0,0 +1,31 @@ +import asyncio +from random import choice + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.statistics import Statistics + + +async def main() -> None: + crawler = ParselCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True) + ) + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Simulate various errors to demonstrate `ErrorSnapshotter` + # saving only the first occurrence of unique error. 
+ await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/playwright_block_requests.py b/website/versioned_docs/version-1.6/examples/code_examples/playwright_block_requests.py new file mode 100644 index 0000000000..991a67aede --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/playwright_block_requests.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Define the hook, which will be called before every request. + @crawler.pre_navigation_hook + async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # Block all requests to URLs that include `adsbygoogle.js` and also all defaults. + await context.block_requests(extra_url_patterns=['adsbygoogle.js']) + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler.py b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler.py new file mode 100644 index 0000000000..f35332b063 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler.py @@ -0,0 +1,67 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + ) + + # Define the default request handler, which will be called for every request. + # The handler receives a context parameter, providing various properties and + # helper methods. Here are a few key ones we use for demonstration: + # - request: an instance of the Request class containing details such as the URL + # being crawled and the HTTP method used. + # - page: Playwright's Page object, which allows interaction with the web page + # (see https://playwright.dev/python/docs/api/class-page for more details). + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + data = [] + + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + rank_element = await post.query_selector('.rank') + + # Extract the data we want from the elements. 
+ title = await title_element.inner_text() if title_element else None + rank = await rank_element.inner_text() if rank_element else None + href = await title_element.get_attribute('href') if title_element else None + + data.append({'title': title, 'rank': rank, 'href': href}) + + # Push the extracted data to the default dataset. In local configuration, + # the data will be stored as JSON files in ./storage/datasets/default. + await context.push_data(data) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Define a hook that will be called each time before navigating to a new URL. + # The hook receives a context parameter, providing access to the request and + # browser page among other things. In this example, we log the URL being + # navigated to. + @crawler.pre_navigation_hook + async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_camoufox.py b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_camoufox.py new file mode 100644 index 0000000000..691197da55 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_camoufox.py @@ -0,0 +1,69 @@ +import asyncio + +# Camoufox is external package and needs to be installed. It is not included in crawlee. 
+from camoufox import AsyncNewBrowser +from typing_extensions import override + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserController, + PlaywrightBrowserPlugin, +) +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +class CamoufoxPlugin(PlaywrightBrowserPlugin): + """Example browser plugin that uses Camoufox browser, + but otherwise keeps the functionality of PlaywrightBrowserPlugin. + """ + + @override + async def new_browser(self) -> PlaywrightBrowserController: + if not self._playwright: + raise RuntimeError('Playwright browser plugin is not initialized.') + + return PlaywrightBrowserController( + browser=await AsyncNewBrowser( + self._playwright, **self._browser_launch_options + ), + # Increase, if camoufox can handle it in your use case. + max_open_pages_per_browser=1, + # This turns off the crawlee header_generation. Camoufox has its own. + header_generator=None, + ) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Custom browser pool. Gives users full control over browsers used by the crawler. + browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]), + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract some data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + + # Extract the data we want from the elements. + title = await title_element.inner_text() if title_element else None + + # Push the extracted data to the default dataset. 
+ await context.push_data({'title': title}) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py new file mode 100644 index 0000000000..90ddc6c3d4 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_error_snapshotter.py @@ -0,0 +1,31 @@ +import asyncio +from random import choice + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.statistics import Statistics + + +async def main() -> None: + crawler = PlaywrightCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True) + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Simulate various errors to demonstrate `ErrorSnapshotter` + # saving only the first occurrence of unique error. 
+ await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py new file mode 100644 index 0000000000..24cb5bb907 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/playwright_crawler_with_fingerprint_generator.py @@ -0,0 +1,44 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.fingerprint_suite import ( + DefaultFingerprintGenerator, + HeaderGeneratorOptions, + ScreenOptions, +) + + +async def main() -> None: + # Use default fingerprint generator with desired fingerprint options. + # Generator will generate real looking browser fingerprint based on the options. + # Unspecified fingerprint options will be automatically selected by the generator. + fingerprint_generator = DefaultFingerprintGenerator( + header_options=HeaderGeneratorOptions(browsers=['chrome']), + screen_options=ScreenOptions(min_width=400), + ) + + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Headless mode, set to False to see the browser in action. + headless=False, + # Browser types supported by Playwright. + browser_type='chromium', + # Fingerprint generator to be used. By default no fingerprint generation is done. + fingerprint_generator=fingerprint_generator, + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_on_skipped_request.py b/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_on_skipped_request.py new file mode 100644 index 0000000000..5c7eca173f --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_on_skipped_request.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import SkippedReason +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # highlight-start + # This handler is called when a request is skipped + @crawler.on_skipped_request + async def skipped_request_handler(url: str, reason: SkippedReason) -> None: + # Check if the request was skipped due to robots.txt rules + if reason == 'robots_txt': + crawler.log.info(f'Skipped {url} due to robots.txt rules.') + + # highlight-end + + # Start the crawler with the specified URLs + # The login URL will be skipped and handled by the skipped_request_handler + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_txt_file.py b/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_txt_file.py new file mode 100644 index 0000000000..ebd63b1c2e --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/respect_robots_txt_file.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Start the crawler with the specified URLs + # The crawler will check the robots.txt file before making requests + # In this example, 'https://news.ycombinator.com/login' will be skipped + # because it's disallowed in the site's robots.txt file + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/resuming_paused_crawl.py b/website/versioned_docs/version-1.6/examples/code_examples/resuming_paused_crawl.py new file mode 100644 index 0000000000..e87e428469 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/resuming_paused_crawl.py @@ -0,0 +1,40 @@ +import asyncio + +from crawlee import ConcurrencySettings, service_locator +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + +# Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run. +# This makes the scraper continue from where it left off in the previous run. 
+# The recommended way to achieve this behavior is setting the environment variable +# `CRAWLEE_PURGE_ON_START=0` +configuration = service_locator.get_configuration() +configuration.purge_on_start = False + + +async def main() -> None: + crawler = BeautifulSoupCrawler( + # Let's slow down the crawler for a demonstration + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=20) + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # List of links for crawl + requests = [ + 'https://crawlee.dev', + 'https://crawlee.dev/python/docs', + 'https://crawlee.dev/python/docs/examples', + 'https://crawlee.dev/python/docs/guides', + 'https://crawlee.dev/python/docs/quick-start', + ] + + await crawler.run(requests) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/run_parallel_crawlers.py b/website/versioned_docs/version-1.6/examples/code_examples/run_parallel_crawlers.py new file mode 100644 index 0000000000..5ce94a58fa --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/run_parallel_crawlers.py @@ -0,0 +1,94 @@ +import asyncio + +from crawlee import ConcurrencySettings +from crawlee.crawlers import ( + ParselCrawler, + ParselCrawlingContext, + PlaywrightCrawler, + PlaywrightCrawlingContext, +) +from crawlee.sessions import SessionPool +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Open request queues for both crawlers with different aliases + playwright_rq = await RequestQueue.open(alias='playwright-requests') + parsel_rq = await RequestQueue.open(alias='parsel-requests') + + # Use a shared session pool between both crawlers + async with SessionPool() as session_pool: + playwright_crawler = PlaywrightCrawler( + # Set the request queue for Playwright crawler + request_manager=playwright_rq, + 
session_pool=session_pool, + # Configure concurrency settings for Playwright crawler + concurrency_settings=ConcurrencySettings( + max_concurrency=5, desired_concurrency=5 + ), + # Set `keep_alive`` so that the crawler does not stop working when there are + # no requests in the queue. + keep_alive=True, + ) + + parsel_crawler = ParselCrawler( + # Set the request queue for Parsel crawler + request_manager=parsel_rq, + session_pool=session_pool, + # Configure concurrency settings for Parsel crawler + concurrency_settings=ConcurrencySettings( + max_concurrency=10, desired_concurrency=10 + ), + # Set maximum requests per crawl for Parsel crawler + max_requests_per_crawl=50, + ) + + @playwright_crawler.router.default_handler + async def handle_playwright(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Playwright Processing {context.request.url}...') + + title = await context.page.title() + # Push the extracted data to the dataset for Playwright crawler + await context.push_data( + {'title': title, 'url': context.request.url, 'source': 'playwright'}, + dataset_name='playwright-data', + ) + + @parsel_crawler.router.default_handler + async def handle_parsel(context: ParselCrawlingContext) -> None: + context.log.info(f'Parsel Processing {context.request.url}...') + + title = context.parsed_content.css('title::text').get() + # Push the extracted data to the dataset for Parsel crawler + await context.push_data( + {'title': title, 'url': context.request.url, 'source': 'parsel'}, + dataset_name='parsel-data', + ) + + # Enqueue links to the Playwright request queue for blog pages + await context.enqueue_links( + selector='a[href*="/blog/"]', rq_alias='playwright-requests' + ) + # Enqueue other links to the Parsel request queue + await context.enqueue_links(selector='a:not([href*="/blog/"])') + + # Start the Playwright crawler in the background + background_crawler_task = asyncio.create_task(playwright_crawler.run([])) + + # Run the Parsel crawler with the 
initial URL and wait for it to finish + await parsel_crawler.run(['https://crawlee.dev/blog']) + + # Wait for the Playwright crawler to finish processing all requests + while not await playwright_rq.is_empty(): + playwright_crawler.log.info('Waiting for Playwright crawler to finish...') + await asyncio.sleep(5) + + # Stop the Playwright crawler after all requests are processed + playwright_crawler.stop() + + # Wait for the background Playwright crawler task to complete + await background_crawler_task + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_chrome.py b/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_chrome.py new file mode 100644 index 0000000000..6831a9b41d --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_chrome.py @@ -0,0 +1,54 @@ +import asyncio +import shutil +from pathlib import Path +from tempfile import TemporaryDirectory + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +# Profile name to use (usually 'Default' for single profile setups) +PROFILE_NAME = 'Default' + +# Paths to Chrome profiles in your system (example for Windows) +# Use `chrome://version/` to find your profile path +PROFILE_PATH = Path(Path.home(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data') + + +async def main() -> None: + # Create a temporary folder to copy the profile to + with TemporaryDirectory(prefix='crawlee-') as tmpdirname: + tmp_profile_dir = Path(tmpdirname) + + # Copy the profile to a temporary folder + shutil.copytree( + PROFILE_PATH / PROFILE_NAME, + tmp_profile_dir / PROFILE_NAME, + dirs_exist_ok=True, + ) + + crawler = PlaywrightCrawler( + headless=False, + # Use the installed Chrome browser + browser_type='chrome', + # Disable fingerprints to preserve profile identity + fingerprint_generator=None, + # Set user data directory to temp folder + 
user_data_dir=tmp_profile_dir, + browser_launch_options={ + # Slow down actions to mimic human behavior + 'slow_mo': 200, + 'args': [ + # Use the specified profile + f'--profile-directory={PROFILE_NAME}', + ], + }, + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Visiting {context.request.url}') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_firefox.py b/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_firefox.py new file mode 100644 index 0000000000..8510269efc --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/using_browser_profiles_firefox.py @@ -0,0 +1,42 @@ +import asyncio +from pathlib import Path + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +# Replace this with your actual Firefox profile name +# Find it at about:profiles in Firefox +PROFILE_NAME = 'your-profile-name-here' + +# Paths to Firefox profiles in your system (example for Windows) +# Use `about:profiles` to find your profile path +PROFILE_PATH = Path( + Path.home(), 'AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles', PROFILE_NAME +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Use Firefox browser type + browser_type='firefox', + # Disable fingerprints to use the profile as is + fingerprint_generator=None, + headless=False, + # Path to your Firefox profile + user_data_dir=PROFILE_PATH, + browser_launch_options={ + 'args': [ + # Required to avoid version conflicts + '--allow-downgrade' + ] + }, + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Visiting {context.request.url}') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + 
asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/code_examples/using_sitemap_request_loader.py b/website/versioned_docs/version-1.6/examples/code_examples/using_sitemap_request_loader.py new file mode 100644 index 0000000000..18079c51f2 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/code_examples/using_sitemap_request_loader.py @@ -0,0 +1,101 @@ +import asyncio +from collections.abc import Callable + +from yarl import URL + +from crawlee import RequestOptions, RequestTransformAction +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.http_clients import ImpitHttpClient +from crawlee.request_loaders import SitemapRequestLoader + + +# Create a transform_request_function that maps request options based on the host in +# the URL +def create_transform_request( + data_mapper: dict[str, dict], +) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]: + def transform_request( + request_options: RequestOptions, + ) -> RequestOptions | RequestTransformAction: + # According to the Sitemap protocol, all URLs in a Sitemap must be from a single + # host. 
+ request_host = URL(request_options['url']).host + + if request_host and (mapping_data := data_mapper.get(request_host)): + # Set properties from the mapping data + if 'label' in mapping_data: + request_options['label'] = mapping_data['label'] + if 'user_data' in mapping_data: + request_options['user_data'] = mapping_data['user_data'] + + return request_options + + return 'unchanged' + + return transform_request + + +async def main() -> None: + # Prepare data mapping for hosts + apify_host = URL('https://apify.com/sitemap.xml').host + crawlee_host = URL('https://crawlee.dev/sitemap.xml').host + + if not apify_host or not crawlee_host: + raise ValueError('Unable to extract host from URLs') + + data_map = { + apify_host: { + 'label': 'apify', + 'user_data': {'source': 'apify'}, + }, + crawlee_host: { + 'label': 'crawlee', + 'user_data': {'source': 'crawlee'}, + }, + } + + # Initialize the SitemapRequestLoader with the transform function + async with SitemapRequestLoader( + # Set the sitemap URLs and the HTTP client + sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'], + http_client=ImpitHttpClient(), + transform_request_function=create_transform_request(data_map), + ) as sitemap_loader: + # Convert the sitemap loader to a request manager + request_manager = await sitemap_loader.to_tandem() + + # Create and configure the crawler + crawler = BeautifulSoupCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, + ) + + # Create default handler for requests without a specific label + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + source = context.request.user_data.get('source', 'unknown') + context.log.info( + f'Processing request: {context.request.url} from source: {source}' + ) + + # Create handler for requests labeled 'apify' + @crawler.router.handler('apify') + async def apify_handler(context: BeautifulSoupCrawlingContext) -> None: + source = 
context.request.user_data.get('source', 'unknown') + context.log.info( + f'Apify handler processing: {context.request.url} from source: {source}' + ) + + # Create handler for requests labeled 'crawlee' + @crawler.router.handler('crawlee') + async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None: + source = context.request.user_data.get('source', 'unknown') + context.log.info( + f'Crawlee handler processing: {context.request.url} from source: {source}' + ) + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/examples/crawl_all_links_on_website.mdx b/website/versioned_docs/version-1.6/examples/crawl_all_links_on_website.mdx new file mode 100644 index 0000000000..f17c63920f --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawl_all_links_on_website.mdx @@ -0,0 +1,33 @@ +--- +id: crawl-all-links-on-website +title: Crawl all links on website +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_pw.py'; + +This example uses the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper to add new links to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages. + +:::tip + +If no options are given, by default the method will only add links that are under the same subdomain. 
This behavior can be controlled with the `strategy` option, which is an instance of the `EnqueueStrategy` type alias. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example. + +::: + +<Tabs groupId="main"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/crawl_multiple_urls.mdx b/website/versioned_docs/version-1.6/examples/crawl_multiple_urls.mdx new file mode 100644 index 0000000000..2d3d370283 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawl_multiple_urls.mdx @@ -0,0 +1,27 @@ +--- +id: crawl-multiple-urls +title: Crawl multiple URLs +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py'; + +This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently. 
+ +<Tabs groupId="main"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/crawl_specific_links_on_website.mdx b/website/versioned_docs/version-1.6/examples/crawl_specific_links_on_website.mdx new file mode 100644 index 0000000000..b350568421 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawl_specific_links_on_website.mdx @@ -0,0 +1,47 @@ +--- +id: crawl-specific-links-on-website +title: Crawl specific links on website +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py'; +import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py'; + +import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py'; +import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py'; + +This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. 
Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content. + +<Tabs groupId="first-example"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> + +## Even more control over the enqueued links + +<ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is a convenience helper and internally it calls <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> to find the links and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> to add them to the queue. If you need some additional custom filtering of the extracted links before enqueuing them, then consider using <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> instead of the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> + +<Tabs groupId="second-example"> + <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExampleExtractAndAdd} + </RunnableCodeBlock> + </TabItem> + <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> + <RunnableCodeBlock className="language-python" language="python"> + {PlaywrightExampleExtractAndAdd} + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/crawl_website_with_relative_links.mdx b/website/versioned_docs/version-1.6/examples/crawl_website_with_relative_links.mdx new file mode 100644 index 
0000000000..4cf7bee845 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawl_website_with_relative_links.mdx @@ -0,0 +1,52 @@ +--- +id: crawl-website-with-relative-links +title: Crawl website with relative links +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import AllLinksExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_all_links.py'; +import SameDomainExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_domain.py'; +import SameHostnameExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_hostname.py'; +import SameOriginExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_origin.py'; + +When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> method on the crawler context, which will automatically find and add these links to the crawler's <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context. + +:::note + +For these examples, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way. + +::: + +`EnqueueStrategy` type alias provides four distinct strategies for crawling relative links: + +- `all` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites. 
+- `same-domain` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included. +- `same-hostname` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains. +- `same-origin` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl. + +<Tabs groupId="main"> + <TabItem value="all_links" label="All links"> + <RunnableCodeBlock className="language-python" language="python"> + {AllLinksExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="same-domain" label="Same domain"> + <RunnableCodeBlock className="language-python" language="python"> + {SameDomainExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="same-hostname" label="Same hostname"> + <RunnableCodeBlock className="language-python" language="python"> + {SameHostnameExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="same-origin" label="Same origin"> + <RunnableCodeBlock className="language-python" language="python"> + {SameOriginExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/crawler_keep_alive.mdx b/website/versioned_docs/version-1.6/examples/crawler_keep_alive.mdx new file mode 100644 index 0000000000..2e6c6640c7 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawler_keep_alive.mdx @@ -0,0 +1,15 @@ +--- +id: crawler-keep-alive +title: Keep a Crawler alive waiting for more requests +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py'; + +This 
example demonstrates how to keep crawler alive even when there are no requests at the moment by using `keep_alive=True` argument of <ApiLink to="class/BasicCrawler#__init__">`BasicCrawler.__init__`</ApiLink>. This is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> and in the example below it is shown on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. To stop the crawler that was started with `keep_alive=True` you can call `crawler.stop()`. + +<RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/crawler_stop.mdx b/website/versioned_docs/version-1.6/examples/crawler_stop.mdx new file mode 100644 index 0000000000..4ea7f28565 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/crawler_stop.mdx @@ -0,0 +1,15 @@ +--- +id: crawler-stop +title: Stopping a Crawler with stop method +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py'; + +This example demonstrates how to use `stop` method of <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> to stop crawler once the crawler finds what it is looking for. This method is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> and in the example below it is shown on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being concurrently processed are going to get finished. 
It is possible to call `stop` method with optional argument `reason` that is a string that will be used in logs and it can improve logs readability especially if you have multiple different conditions for triggering `stop`. + +<RunnableCodeBlock className="language-python" language="python"> + {BeautifulSoupExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/export_entire_dataset_to_file.mdx b/website/versioned_docs/version-1.6/examples/export_entire_dataset_to_file.mdx new file mode 100644 index 0000000000..5cf4a2da77 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/export_entire_dataset_to_file.mdx @@ -0,0 +1,33 @@ +--- +id: export-entire-dataset-to-file +title: Export entire dataset to file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py'; +import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py'; + +This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior. + +:::note + +For these examples, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way. 
+ +::: + +<Tabs groupId="main"> + <TabItem value="json" label="JSON"> + <RunnableCodeBlock className="language-python" language="python"> + {JsonExample} + </RunnableCodeBlock> + </TabItem> + <TabItem value="csv" label="CSV"> + <RunnableCodeBlock className="language-python" language="python"> + {CsvExample} + </RunnableCodeBlock> + </TabItem> +</Tabs> diff --git a/website/versioned_docs/version-1.6/examples/fill_and_submit_web_form.mdx b/website/versioned_docs/version-1.6/examples/fill_and_submit_web_form.mdx new file mode 100644 index 0000000000..841a2616ee --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/fill_and_submit_web_form.mdx @@ -0,0 +1,113 @@ +--- +id: fill-and-submit-web-form +title: Fill and submit web form +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RequestExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_request.py'; +import CrawlerExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_crawler.py'; + +This example demonstrates how to fill and submit a web form using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> crawler. The same approach applies to any crawler that inherits from it, such as the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> or <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>. + +We are going to use the [httpbin.org](https://httpbin.org) website to demonstrate how it works. + +## Investigate the form fields + +First, we need to examine the form fields and the form's action URL. You can do this by opening the [httpbin.org/forms/post](https://httpbin.org/forms/post) page in a browser and inspecting the form fields. + +In Chrome, right-click on the page and select "Inspect" or press `Ctrl+Shift+I`. 
+Use the element selector (`Ctrl+Shift+C`) to click on the form element you want to inspect. + +![HTML input element name](/img/fill-and-submit-web-form/00.jpg 'HTML input element name.') + +Identify the field names. For example, the customer name field is `custname`, the email field is `custemail`, and the phone field is `custtel`. + +Now navigate to the "Network" tab in developer tools and submit the form by clicking the "Submit order" button. + +![Submitting the form](/img/fill-and-submit-web-form/01.jpg 'Submitting the form.') + +Find the form submission request and examine its details. The "Headers" tab will show the submission URL, in this case, it is `https://httpbin.org/post`. + +![Network request investigation](/img/fill-and-submit-web-form/02.jpg 'Network request investigation.') + +The "Payload" tab will display the form fields and their submitted values. This method could be an alternative to inspecting the HTML source code directly. + +![Network payload investigation](/img/fill-and-submit-web-form/03.jpg 'Network payload investigation.') + +## Preparing a POST request + +Now, let's create a POST request with the form fields and their values using the <ApiLink to="class/Request">`Request`</ApiLink> class, specifically its <ApiLink to="class/Request#from_url">`Request.from_url`</ApiLink> constructor: + +<RunnableCodeBlock className="language-python" language="python"> + {RequestExample} +</RunnableCodeBlock> + +Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach. + +## Implementing the crawler + +Finally, let's implement the crawler and run it with the prepared request. Although we are using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, the process is the same for any crawler that inherits from it. 
+ +<RunnableCodeBlock className="language-python" language="python"> + {CrawlerExample} +</RunnableCodeBlock> + +## Running the crawler + +Finally, run your crawler. Your logs should show something like this: + +```plaintext +... +[crawlee.http_crawler._http_crawler] INFO Processing https://httpbin.org/post ... +[crawlee.http_crawler._http_crawler] INFO Response: { + "args": {}, + "data": "", + "files": {}, + "form": { + "comments": "Please ring the doorbell upon arrival.", + "custemail": "johndoe@example.com", + "custname": "John Doe", + "custtel": "1234567890", + "delivery": "13:00", + "size": "large", + "topping": [ + "bacon", + "cheese", + "mushroom" + ] + }, + "headers": { + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Content-Length": "190", + "Content-Type": "application/x-www-form-urlencoded", + "Host": "httpbin.org", + "User-Agent": "python-httpx/0.27.0", + "X-Amzn-Trace-Id": "Root=1-66c849d6-1ae432fb7b4156e6149ff37f" + }, + "json": null, + "origin": "78.80.81.196", + "url": "https://httpbin.org/post" +} + +[crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish +[crawlee.http_crawler._http_crawler] INFO Final request statistics: +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ requests_finished โ”‚ 1 โ”‚ +โ”‚ requests_failed โ”‚ 0 โ”‚ +โ”‚ retry_histogram โ”‚ [1] โ”‚ +โ”‚ request_avg_failed_duration โ”‚ None โ”‚ +โ”‚ request_avg_finished_duration โ”‚ 0.678442 โ”‚ +โ”‚ requests_finished_per_minute โ”‚ 85 โ”‚ +โ”‚ requests_failed_per_minute โ”‚ 0 โ”‚ +โ”‚ request_total_duration โ”‚ 0.678442 โ”‚ +โ”‚ requests_total โ”‚ 1 โ”‚ +โ”‚ crawler_runtime โ”‚ 0.707666 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +This log output confirms that the crawler successfully submitted the form and processed the response. Congratulations! 
You have successfully filled and submitted a web form using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>.
diff --git a/website/versioned_docs/version-1.6/examples/json_logging.mdx b/website/versioned_docs/version-1.6/examples/json_logging.mdx
new file mode 100644
index 0000000000..06dd2ac492
--- /dev/null
+++ b/website/versioned_docs/version-1.6/examples/json_logging.mdx
@@ -0,0 +1,57 @@
+---
+id: configure-json-logging
+title: Configure JSON logging
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import JsonLoggingExample from '!!raw-loader!roa-loader!./code_examples/configure_json_logging.py';
+
+This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By using the `use_table_logs=False` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.
+
+The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.
+
+<RunnableCodeBlock className="language-python" language="python">
+    {JsonLoggingExample}
+</RunnableCodeBlock>
+
+Here's an example of what a crawler statistics log entry looks like in JSONL format:
+ +```json +{ + "text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n", + "record": { + "elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 }, + "exception": null, + "extra": { + "requests_finished": 1, + "requests_failed": 0, + "retry_histogram": [1], + "request_avg_failed_duration": null, + "request_avg_finished_duration": 3.57098, + "requests_finished_per_minute": 17, + "requests_failed_per_minute": 0, + "request_total_duration": 3.57098, + "requests_total": 1, + "crawler_runtime": 3.59165 + }, + "file": { + "name": "_basic_crawler.py", + "path": "/crawlers/_basic/_basic_crawler.py" + }, + "function": "run", + "level": { "icon": "โ„น๏ธ", "name": "INFO", "no": 20 }, + "line": 583, + "message": "Final request statistics:", + "module": "_basic_crawler", + "name": "HttpCrawler", + "process": { "id": 198383, "name": "MainProcess" }, + "thread": { "id": 135312814966592, "name": "MainThread" }, + "time": { + "repr": "2025-03-17 17:14:45.339150+00:00", + "timestamp": 1742231685.33915 + } + } +} +``` diff --git a/website/versioned_docs/version-1.6/examples/parsel_crawler.mdx b/website/versioned_docs/version-1.6/examples/parsel_crawler.mdx new file mode 100644 index 0000000000..b0eca7eb28 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/parsel_crawler.mdx @@ -0,0 +1,15 @@ +--- +id: parsel-crawler +title: Parsel crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py'; + +This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl 
a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. + +<RunnableCodeBlock className="language-python" language="python"> + {ParselCrawlerExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/playwright_crawler.mdx b/website/versioned_docs/version-1.6/examples/playwright_crawler.mdx new file mode 100644 index 0000000000..70b0bc8afb --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/playwright_crawler.mdx @@ -0,0 +1,19 @@ +--- +id: playwright-crawler +title: Playwright crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py'; + +This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> to recursively scrape the Hacker news website using headless Chromium and Playwright. + +The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. 
Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content. + +A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. + +<RunnableCodeBlock className="language-python" language="python"> + {PlaywrightCrawlerExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/playwright_crawler_adaptive.mdx b/website/versioned_docs/version-1.6/examples/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..f915f0246f --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/playwright_crawler_adaptive.mdx @@ -0,0 +1,20 @@ +--- +id: adaptive-playwright-crawler +title: Adaptive Playwright crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import AdaptivePlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/adaptive_playwright_crawler.py'; + +This example demonstrates how to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>. An <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. +It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. 
+
+A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by the HTTP-based sub-crawler and the Playwright-based sub-crawler. Use `playwright_only=True` to mark hooks that should be executed only for the Playwright sub-crawler.
+
+For a more detailed description, please see the [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler).
+
+<RunnableCodeBlock className="language-python" language="python">
+    {AdaptivePlaywrightCrawlerExample}
+</RunnableCodeBlock>
diff --git a/website/versioned_docs/version-1.6/examples/playwright_crawler_with_block_requests.mdx b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_block_requests.mdx
new file mode 100644
index 0000000000..d7d5e15928
--- /dev/null
+++ b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_block_requests.mdx
@@ -0,0 +1,27 @@
+---
+id: playwright-crawler-with-block-requests
+title: Playwright crawler with block requests
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import PlaywrightBlockRequests from '!!raw-loader!roa-loader!./code_examples/playwright_block_requests.py';
+
+This example demonstrates how to optimize your <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> performance by blocking unnecessary network requests.
+
+The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.
+
+The <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> helper provides the most efficient way to block requests as it operates directly in the browser.
+
+By default, <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> will block all URLs including the following patterns:
+
+```python
+['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']
+```
+
+You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.
+
+<RunnableCodeBlock className="language-python" language="python">
+    {PlaywrightBlockRequests}
+</RunnableCodeBlock>
diff --git a/website/versioned_docs/version-1.6/examples/playwright_crawler_with_camoufox.mdx b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_camoufox.mdx
new file mode 100644
index 0000000000..b627c9ba34
--- /dev/null
+++ b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_camoufox.mdx
@@ -0,0 +1,26 @@
+---
+id: playwright-crawler-with-camoufox
+title: Playwright crawler with Camoufox
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import PlaywrightCrawlerExampleWithCamoufox from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_camoufox.py';
+
+This example demonstrates how to integrate Camoufox into <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> with custom <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>.
+
+Camoufox is a stealthy minimalistic build of Firefox. For details please visit its homepage at https://camoufox.com/.
+To be able to run this example you will need to install Camoufox, as it is an external tool and is not part of Crawlee. For installation please see https://pypi.org/project/camoufox/.
+
+**Warning!** Camoufox uses a custom build of Firefox. This build can be hundreds of MB large.
+You can either pre-download this file using the following command `python3 -m camoufox fetch`, or Camoufox will download it automatically when you try to run it and it does not find an existing binary.
+For more details please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface
+
+**Project template -** It is possible to generate a project with Python code which includes Camoufox integration into Crawlee through the Crawlee CLI. Call `crawlee create` and pick `Playwright-camoufox` when asked for the crawler type.
+
+The example code after the `PlaywrightCrawler` instantiation is similar to the example describing the use of the Playwright crawler. The main difference is that in this example Camoufox will be used as the browser through `BrowserPool`.
+
+<RunnableCodeBlock className="language-python" language="python">
+    {PlaywrightCrawlerExampleWithCamoufox}
+</RunnableCodeBlock>
diff --git a/website/versioned_docs/version-1.6/examples/playwright_crawler_with_fingerprint_generator.mdx b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_fingerprint_generator.mdx
new file mode 100644
index 0000000000..04727cd74c
--- /dev/null
+++ b/website/versioned_docs/version-1.6/examples/playwright_crawler_with_fingerprint_generator.mdx
@@ -0,0 +1,17 @@
+---
+id: playwright-crawler-with-fingerprint-generator
+title: Playwright crawler with fingerprint generator
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_fingerprint_generator.py';
+
+This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> together with <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> that will populate several browser attributes
to mimic real browser fingerprint. To read more about fingerprints please see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting. + +You can implement your own fingerprint generator or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>. To use the generator initialize it with the desired fingerprint options. The generator will try to create fingerprint based on those options. Unspecified options will be automatically selected by the generator from the set of reasonable values. If some option is important for you, do not rely on the default and explicitly define it. + +<RunnableCodeBlock className="language-python" language="python"> + {PlaywrightCrawlerExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/respect_robots_txt_file.mdx b/website/versioned_docs/version-1.6/examples/respect_robots_txt_file.mdx new file mode 100644 index 0000000000..dc509e16b8 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/respect_robots_txt_file.mdx @@ -0,0 +1,32 @@ +--- +id: respect-robots-txt-file +title: Respect robots.txt file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; +import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py'; + +This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. + +To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. 
+ +As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. + +The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>: + +<RunnableCodeBlock className="language-python" language="python"> + {RespectRobotsTxt} +</RunnableCodeBlock> + +## Handle with `on_skipped_request` + +If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>. + +Let's update the code by adding the `on_skipped_request` handler: + +<RunnableCodeBlock className="language-python" language="python"> + {OnSkippedRequest} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/resuming_paused_crawl.mdx b/website/versioned_docs/version-1.6/examples/resuming_paused_crawl.mdx new file mode 100644 index 0000000000..8d2213d11d --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/resuming_paused_crawl.mdx @@ -0,0 +1,35 @@ +--- +id: resuming-paused-crawl +title: Resuming a paused crawl +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ResumeCrawl from '!!raw-loader!roa-loader!./code_examples/resuming_paused_crawl.py'; + +This example demonstrates how to resume crawling from its last state when running locally, if for some reason it was unexpectedly terminated. + +If each run should continue crawling from the previous state, you can configure this using `purge_on_start` in <ApiLink to="class/Configuration">`Configuration`</ApiLink>. + +Use the code below and perform 2 sequential runs. 
During the 1st run, stop the crawler by pressing `CTRL+C`, and the 2nd run will resume crawling from where it stopped. + +<RunnableCodeBlock className="language-python" language="python"> + {ResumeCrawl} +</RunnableCodeBlock> + +Perform the 1st run, interrupting the crawler with `CTRL+C` after 2 links have been processed. + +![Run with interruption](/img/resuming-paused-crawl/00.webp 'Run with interruption.') + +Now resume crawling after the pause to process the remaining 3 links. + +![Resuming crawling](/img/resuming-paused-crawl/01.webp 'Resuming crawling.') + +Alternatively, use the environment variable `CRAWLEE_PURGE_ON_START=0` instead of using `configuration.purge_on_start = False`. + +For example, when running code: + +```bash +CRAWLEE_PURGE_ON_START=0 python -m best_crawler +``` diff --git a/website/versioned_docs/version-1.6/examples/run_parallel_crawlers.mdx b/website/versioned_docs/version-1.6/examples/run_parallel_crawlers.mdx new file mode 100644 index 0000000000..fba5c437b7 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/run_parallel_crawlers.mdx @@ -0,0 +1,19 @@ +--- +id: run-parallel-crawlers +title: Run parallel crawlers +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py'; + +This example demonstrates how to run two parallel crawlers where one crawler processes links discovered by another crawler. + +In some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> for navigating JavaScript-heavy pages and a faster, more lightweight <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> for processing static pages. 
One way to solve this is to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>, see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more. + +The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients). + +<RunnableCodeBlock className="language-python" language="python"> + {RunParallelCrawlersExample} +</RunnableCodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/using_browser_profile.mdx b/website/versioned_docs/version-1.6/examples/using_browser_profile.mdx new file mode 100644 index 0000000000..8eda2554a4 --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/using_browser_profile.mdx @@ -0,0 +1,39 @@ +--- +id: using_browser_profile +title: Using browser profile +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import CodeBlock from '@theme/CodeBlock'; + +import ChromeProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_chrome.py'; +import FirefoxProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_firefox.py'; + +This example demonstrates how to run <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/). + +Using browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. 
This can be particularly useful for testing scenarios or when you need to access content that requires authentication. + +## Chrome browser + +To run <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`. + +:::warning Profile access limitation +Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround. +::: + +Make sure you don't have any running Chrome browser processes before running this code: + +<CodeBlock className="language-python" language="python"> + {ChromeProfileExample} +</CodeBlock> + +## Firefox browser + +To find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. Unlike Chrome, you can use your standard profile path directly without copying it first. 
+ +Make sure you don't have any running Firefox browser processes before running this code: + +<CodeBlock className="language-python" language="python"> + {FirefoxProfileExample} +</CodeBlock> diff --git a/website/versioned_docs/version-1.6/examples/using_sitemap_request_loader.mdx b/website/versioned_docs/version-1.6/examples/using_sitemap_request_loader.mdx new file mode 100644 index 0000000000..3ed528e94e --- /dev/null +++ b/website/versioned_docs/version-1.6/examples/using_sitemap_request_loader.mdx @@ -0,0 +1,22 @@ +--- +id: using-sitemap-request-loader +title: Using sitemap request loader +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py'; + +This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps. + +The example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections. + +The following code example implements processing of sitemaps from two different domains (Apify and Crawlee), with different labels assigned to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers based on the assigned labels. 
+ +<RunnableCodeBlock className="language-python" language="python"> + {SitemapRequestLoaderExample} +</RunnableCodeBlock> + +For more information about request loaders, see the [Request loaders guide](../guides/request-loaders). diff --git a/website/versioned_docs/version-1.6/guides/architecture_overview.mdx b/website/versioned_docs/version-1.6/guides/architecture_overview.mdx new file mode 100644 index 0000000000..0f1b235b60 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/architecture_overview.mdx @@ -0,0 +1,407 @@ +--- +id: architecture-overview +title: Architecture overview +description: An overview of the core components of the Crawlee library and its architecture. +--- + +import ApiLink from '@site/src/components/ApiLink'; + +Crawlee is a modern and modular web scraping framework. It is designed for both HTTP-only and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system. + +## Crawler + +The main user-facing component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers. + +:::info + +You will learn more about the request handlers in the request router section. 
+ +::: + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class BasicCrawler { + <<abstract>> +} + +class AbstractHttpCrawler { + <<abstract>> +} + +%% ======================== +%% Specific classes +%% ======================== + +class HttpCrawler + +class ParselCrawler + +class BeautifulSoupCrawler + +class PlaywrightCrawler + +class AdaptivePlaywrightCrawler + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawler --|> AbstractHttpCrawler +BasicCrawler --|> PlaywrightCrawler +BasicCrawler --|> AdaptivePlaywrightCrawler +AbstractHttpCrawler --|> HttpCrawler +AbstractHttpCrawler --|> ParselCrawler +AbstractHttpCrawler --|> BeautifulSoupCrawler +``` + +### HTTP crawlers + +HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients). + +HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category: + +- <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser. +- <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML. +- <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required. 
+ +You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers). + +### Browser crawlers + +Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler). + +### Adaptive crawler + +The <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> sits between HTTP and browser crawlers. It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. It also provides a uniform interface for both crawling types (modes). You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler). + +## Crawling contexts + +Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, or use helper methods to interact with storages, and extract and enqueue new requests. 
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Classes +%% ======================== + +class BasicCrawlingContext + +class HttpCrawlingContext + +class HttpCrawlingResult + +class ParsedHttpCrawlingContext + +class ParselCrawlingContext + +class BeautifulSoupCrawlingContext + +class PlaywrightPreNavCrawlingContext + +class PlaywrightCrawlingContext + +class AdaptivePlaywrightPreNavCrawlingContext + +class AdaptivePlaywrightCrawlingContext + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawlingContext --|> HttpCrawlingContext + +HttpCrawlingResult --|> HttpCrawlingContext + +HttpCrawlingContext --|> ParsedHttpCrawlingContext + +ParsedHttpCrawlingContext --|> ParselCrawlingContext + +ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext + +BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext + +PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext + +BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext + +ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext +``` + +They have a similar inheritance structure as the crawlers, with the base class being <ApiLink to="class/BasicCrawlingContext">`BasicCrawlingContext`</ApiLink>. The specific crawling contexts are: +- <ApiLink to="class/HttpCrawlingContext">`HttpCrawlingContext`</ApiLink> for HTTP crawlers. +- <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses. +- <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing. +- <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing. 
+- <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated. +- <ApiLink to="class/PlaywrightCrawlingContext">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers. +- <ApiLink to="class/AdaptivePlaywrightPreNavCrawlingContext">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated. +- <ApiLink to="class/AdaptivePlaywrightCrawlingContext">`AdaptivePlaywrightCrawlingContext`</ApiLink> for Adaptive Playwright crawlers. + +## Storages + +Storages are the components that manage data in Crawlee. They provide a way to store and retrieve data during the crawling process. Crawlee's storage system consists of two main layers: + +- **Storages**: High-level interfaces for interacting with different storage types +- **Storage clients**: Backend implementations that handle the actual data persistence and management (you will learn more about them in the next section) + +Crawlee provides three built-in storage types for managing data: + +- <ApiLink to="class/Dataset">`Dataset`</ApiLink> - Append-only, tabular storage for structured data. It is ideal for storing scraping results. +- <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink> - Storage for arbitrary data like JSON documents, images or configs. It supports get and set operations with key-value pairs; updates are only possible by replacement. +- <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> - A managed queue for pending and completed requests, with automatic deduplication and dynamic addition of new items. It is used to track URLs for crawling. + +See the [Storages guide](./storages) for more details. 
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class Storage { + <<abstract>> +} + +%% ======================== +%% Specific classes +%% ======================== + +class Dataset + +class KeyValueStore + +class RequestQueue + +%% ======================== +%% Inheritance arrows +%% ======================== + +Storage --|> Dataset +Storage --|> KeyValueStore +Storage --|> RequestQueue +``` + +## Storage clients + +Storage clients are the backend implementations for storages that handle interactions with different storage systems. They provide a unified interface for <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying storage implementation. + +Crawlee provides several built-in storage client implementations: + +- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence (ideal for testing and fast operations). +- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with caching (default client). +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You can find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction). 
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class StorageClient { + <<abstract>> +} + +%% ======================== +%% Specific classes +%% ======================== + +class MemoryStorageClient + +class FileSystemStorageClient + +class ApifyStorageClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +StorageClient --|> MemoryStorageClient +StorageClient --|> FileSystemStorageClient +StorageClient --|> ApifyStorageClient +``` + +Storage clients can be registered globally with the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> (you will learn more about the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> in the next section), passed directly to crawlers, or specified when opening individual storage instances. You can also create custom storage clients by implementing the <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> interface. + +See the [Storage clients guide](./storage-clients) for more details. + +## Request router + +The request <ApiLink to="class/Router">`Router`</ApiLink> is a central component that manages the flow of requests and responses in Crawlee. It is responsible for routing requests to the appropriate request handlers, managing the crawling context, and coordinating the execution of user-defined logic. + +### Request handlers + +Request handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the <ApiLink to="class/Router">`Router`</ApiLink>. 
+ +The request routing in Crawlee supports: +- Default handlers - Fallback handlers for requests without specific labels. +- Label-based routing - Handlers for specific request types based on labels. +- Error handlers - Handle errors during request processing. +- Failed request handlers - Handle requests that exceed retry limits. +- Pre-navigation hooks - Execute logic before navigating to URLs. + +See the [Request router guide](./request-router) for detailed information and examples. + +## Service locator + +The <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator coordinates these three services: + +- <ApiLink to="class/Configuration">`Configuration`</ApiLink> - Application-wide settings and parameters that control various aspects of Crawlee behavior. +- <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> - Backend implementation for data storage across datasets, key-value stores, and request queues. +- <ApiLink to="class/EventManager">`EventManager`</ApiLink> - Event coordination system for internal framework events and custom user hooks. + +Services can be registered globally through the `service_locator` singleton instance, passed to crawler constructors, or provided when opening individual storage instances. The service locator includes conflict prevention mechanisms to ensure configuration consistency and prevent accidental service conflicts during runtime. + +See the [Service locator guide](./service-locator) for detailed information about service registration and configuration options. + +## Request loaders + +Request loaders provide a subset of <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> functionality, focusing specifically on reading and accessing streams of requests from various sources. 
They define how requests are fetched and processed, enabling use cases such as reading URLs from files, external APIs, sitemaps, or combining multiple sources together. Unlike request queues, they do not handle storage or persistence—they only provide request reading capabilities.
+
+- <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> - Base interface for read-only access to a stream of requests, with capabilities like fetching the next request, marking as handled, and status checking.
+- <ApiLink to="class/RequestList">`RequestList`</ApiLink> - Lightweight in-memory implementation of `RequestLoader` for managing static lists of URLs.
+- <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> - A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.
+
+### Request managers
+
+<ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> with write capabilities for adding and reclaiming requests, providing full request management functionality. <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is the primary concrete implementation of <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>.
+
+<ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> combines a read-only `RequestLoader` with a writable <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager.
+
+Request loaders are useful when you need to start with a predefined set of URLs.
The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically. + +See the [Request loaders guide](./request-loaders) for detailed information. + +## Event manager + +The <ApiLink to="class/EventManager">`EventManager`</ApiLink> is responsible for coordinating internal events throughout Crawlee and enabling custom hooks. It provides a system for registering event listeners, emitting events, and managing their execution lifecycle. + +Crawlee provides several implementations of the event manager: + +- <ApiLink to="class/EventManager">`EventManager`</ApiLink> is the base class for event management in Crawlee. +- <ApiLink to="class/LocalEventManager">`LocalEventManager`</ApiLink> extends the base event manager for local environments by automatically emitting `SYSTEM_INFO` events at regular intervals. This provides real-time system metrics including CPU usage and memory consumption, which are essential for internal components like the <ApiLink to="class/Snapshotter">`Snapshotter`</ApiLink> and <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink>. +- [`ApifyEventManager`](https://docs.apify.com/sdk/python/reference/class/PlatformEventManager) - Manages events on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://docs.apify.com/sdk/python/). + +:::info + +You can learn more about <ApiLink to="class/Snapshotter">`Snapshotter`</ApiLink> and <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink> and their configuration in the [Scaling crawlers guide](./scaling-crawlers). + +::: + +Crawlee defines several built-in event types: + +- `PERSIST_STATE` - Emitted periodically to trigger state persistence. +- `SYSTEM_INFO` - Contains CPU and memory usage information. +- `MIGRATING` - Signals that the crawler is migrating to a different environment. +- `ABORTING` - Indicates the crawler is aborting execution. 
+- `EXIT` - Emitted when the crawler is exiting. +- `CRAWLER_STATUS` - Provides status updates from crawlers. + +Additional specialized events for browser and session management are also available. + +The event manager operates as an async context manager, automatically starting periodic tasks when entered and ensuring all listeners complete before exiting. Event listeners can be either synchronous or asynchronous functions and are executed safely without blocking the main event loop. + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class EventManager { + <<abstract>> +} + +%% ======================== +%% Specific classes +%% ======================== + +class LocalEventManager + +class ApifyEventManager + +%% ======================== +%% Inheritance arrows +%% ======================== + +EventManager --|> LocalEventManager +EventManager --|> ApifyEventManager +``` + +## Session management + +The core component of session management in Crawlee is <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>. It manages a collection of sessions that simulate individual users with unique attributes like cookies, IP addresses (via proxies), and browser fingerprints. Sessions help avoid blocking by rotating user identities and maintaining realistic browsing patterns. + +:::info + +You can learn more about fingerprints and how to avoid getting blocked in the [Avoid blocking guide](./avoid-blocking). + +::: + +### Session + +A session is represented as a <ApiLink to="class/Session">`Session`</ApiLink> object, which contains components like cookies, error tracking, usage limits, and expiration handling. 
Sessions can be marked as good (<ApiLink to="class/Session#mark_good">`Session.mark_good`</ApiLink>), bad (<ApiLink to="class/Session#mark_bad">`Session.mark_bad`</ApiLink>), or retired (<ApiLink to="class/Session#retire">`Session.retire`</ApiLink>) based on their performance, and they automatically become unusable when they exceed error thresholds or usage limits. + +### Session pool + +The session pool provides automated session lifecycle management: + +- Automatic rotation - Retrieves random sessions from the pool and creates new ones as needed. +- Pool maintenance - Removes retired sessions and maintains the pool at maximum capacity. +- State persistence - Persists session state to enable recovery across restarts. +- Configurable limits - Supports custom pool sizes, session settings, and creation functions. + +The pool operates as an async context manager, automatically initializing with sessions and cleaning up on exit. It ensures proper session management by rotating sessions based on usage count, expiration time, and custom rules while maintaining optimal pool size. + +See the [Session management guide](./session-management) for more information. + +## Statistics + +The <ApiLink to="class/Statistics">`Statistics`</ApiLink> class provides runtime monitoring for crawler operations, tracking performance metrics like request counts, processing times, retry attempts, and error patterns. It operates as an async context manager, automatically persisting data across crawler restarts and migrations using <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>. + +The system includes error tracking through the <ApiLink to="class/ErrorTracker">`ErrorTracker`</ApiLink> class, which groups similar errors by type and message patterns using wildcard matching. It can capture HTML snapshots and screenshots for debugging and separately track retry-specific errors. 
+ +Statistics are logged at configurable intervals in both table and inline formats, with final summary data returned from the `crawler.run` method available through <ApiLink to="class/FinalStatistics">`FinalStatistics`</ApiLink>. + +## Conclusion + +In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](https://crawlee.dev/python/api), and [Examples](../examples) for more details on how to use these components in your own projects. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx b/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx new file mode 100644 index 0000000000..423338dcfe --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx @@ -0,0 +1,47 @@ +--- +id: avoid-blocking +title: Avoid getting blocked +description: How to avoid getting blocked when scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py'; +import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py'; + +import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py'; + +A scraper might get blocked for numerous reasons. 
Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks. + +Browser fingerprint is a collection of browser attributes and significant features that can show if our browser is a bot or a real user. Moreover, most browsers have these unique features that allow the website to track the browser even within different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, it should significantly reduce the blocking. + +## Using browser fingerprints + +Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>, either pass your own implementation of <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>. + +<RunnableCodeBlock className="language-python" language="python"> + {PlaywrightDefaultFingerprintGenerator} +</RunnableCodeBlock> + +In certain cases we want to narrow down the fingerprints used - e.g. specify a certain operating system, locale or browser. 
This is also possible with Crawlee - the crawler can have the generation algorithm customized to reflect the particular browser version and many more. For description of fingerprint generation options please see <ApiLink to="class/HeaderGeneratorOptions">`HeaderGeneratorOptions`</ApiLink>, <ApiLink to="class/ScreenOptions">`ScreenOptions`</ApiLink> and <ApiLink to="class/BrowserforgeFingerprintGenerator#__init__">`DefaultFingerprintGenerator.__init__`</ApiLink>. See the example below:
+
+<CodeBlock className="language-python">
+  {PlaywrightDefaultFingerprintGeneratorWithArgs}
+</CodeBlock>
+
+If you do not want to use fingerprints, then pass `fingerprint_generator=None` argument to the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>.
+
+## Using Camoufox
+
+In some cases even <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> with fingerprints is not enough. You can try using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> together with [Camoufox](https://camoufox.com/).
See the example integration below: + +<RunnableCodeBlock className="language-python" language="python"> + {PlaywrightWithCamoufox} +</RunnableCodeBlock> + +**Related links** + +- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) +- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py new file mode 100644 index 0000000000..4e6ed92aa6 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.fingerprint_suite import ( + DefaultFingerprintGenerator, + HeaderGeneratorOptions, + ScreenOptions, +) + + +async def main() -> None: + fingerprint_generator = DefaultFingerprintGenerator( + header_options=HeaderGeneratorOptions(browsers=['chrome']), + screen_options=ScreenOptions(min_width=400), + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py new file mode 100644 index 0000000000..5e1c8d2668 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py @@ -0,0 +1,23 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Fingerprint generator is used by default. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py new file mode 100644 index 0000000000..58e5cfed2a --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py @@ -0,0 +1,61 @@ +import asyncio +import io +from pathlib import Path + +from warcio.statusandheaders import StatusAndHeaders +from warcio.warcwriter import WARCWriter + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None: + """Helper function for archiving response in WARC format.""" + # Create WARC records for response + response_body = await context.http_response.read() + response_payload_stream = io.BytesIO(response_body) + + response_headers = StatusAndHeaders( + str(context.http_response.status_code), + context.http_response.headers, + protocol='HTTP/1.1', + ) + response_record = writer.create_warc_record( + context.request.url, + 'response', + payload=response_payload_stream, + length=len(response_body), + http_headers=response_headers, + ) + writer.write_record(response_record) + + +async def main() -> None: + crawler = ParselCrawler( + max_requests_per_crawl=10, + ) + + # Create a WARC archive file a prepare the writer. 
+ archive = Path('example.warc.gz') + with archive.open('wb') as output: + writer = WARCWriter(output, gzip=True) + + # Create a WARC info record to store metadata about the archive. + warcinfo_payload = { + 'software': 'Crawlee', + 'format': 'WARC/1.1', + 'description': 'Example archive created with ParselCrawler', + } + writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload)) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Archiving {context.request.url} ...') + await archive_response(context=context, writer=writer) + await context.enqueue_links(strategy='same-domain') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py new file mode 100644 index 0000000000..955156e3cf --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py @@ -0,0 +1,85 @@ +import asyncio +import io +import logging +from functools import partial +from pathlib import Path + +from playwright.async_api import Request +from warcio.statusandheaders import StatusAndHeaders +from warcio.warcwriter import WARCWriter + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def archive_response( + request: Request, writer: WARCWriter, logger: logging.Logger +) -> None: + """Helper function for archiving response in WARC format.""" + response = await request.response() + if not response: + logger.warning(f'Could not get response {request.url}') + return + try: + response_body = 
await response.body() + except Exception as e: + logger.warning(f'Could not get response body for {response.url}: {e}') + return + logger.info(f'Archiving resource {response.url}') + response_payload_stream = io.BytesIO(response_body) + response_headers = StatusAndHeaders( + str(response.status), response.headers, protocol='HTTP/1.1' + ) + response_record = writer.create_warc_record( + response.url, + 'response', + payload=response_payload_stream, + length=len(response_body), + http_headers=response_headers, + ) + writer.write_record(response_record) + + +async def main() -> None: + crawler = PlaywrightCrawler( + max_requests_per_crawl=1, + headless=False, + ) + + # Create a WARC archive file a prepare the writer. + archive = Path('example.warc.gz') + with archive.open('wb') as output: + writer = WARCWriter(output, gzip=True) + + # Create a WARC info record to store metadata about the archive. + warcinfo_payload = { + 'software': 'Crawlee', + 'format': 'WARC/1.1', + 'description': 'Example archive created with PlaywrightCrawler', + } + writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload)) + + @crawler.pre_navigation_hook + async def archiving_hook(context: PlaywrightPreNavCrawlingContext) -> None: + # Ensure that all responses with additional resources are archived + context.page.on( + 'requestfinished', + partial(archive_response, logger=context.log, writer=writer), + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # For some sites, where the content loads dynamically, + # it is needed to scroll the page to load all content. + # It slows down the crawling, but ensures that all content is loaded. 
+ await context.infinite_scroll() + await context.enqueue_links(strategy='same-domain') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py new file mode 100644 index 0000000000..39be5d1bee --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py @@ -0,0 +1,30 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Use the local wayback server as a proxy + proxy_configuration=ProxyConfiguration(proxy_urls=['http://localhost:8080/']), + # Ignore the HTTPS errors if you have not followed pywb CA setup instructions + browser_launch_options={'ignore_https_errors': True}, + max_requests_per_crawl=10, + headless=False, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Archiving {context.request.url} ...') + # For some sites, where the content loads dynamically, + # it is needed to scroll the page to load all content. + # It slows down the crawling, but ensures that all content is loaded. 
+ await context.infinite_scroll() + await context.enqueue_links(strategy='same-domain') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/error_handling/change_handle_error_status.py b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/change_handle_error_status.py new file mode 100644 index 0000000000..55bf5a0e61 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/change_handle_error_status.py @@ -0,0 +1,45 @@ +import asyncio +import json + +from crawlee import HttpHeaders +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.errors import HttpStatusCodeError +from crawlee.sessions import SessionPool + +# Using a placeholder refresh token for this example +REFRESH_TOKEN = 'PLACEHOLDER' +UNAUTHORIZED_CODE = 401 + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=2, + # Only treat 403 as a blocking status code, not 401 + session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}), + # Don't treat 401 responses as errors + ignore_http_error_status_codes=[UNAUTHORIZED_CODE], + ) + + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Now we can handle 401 responses ourselves + if context.http_response.status_code == UNAUTHORIZED_CODE: + # Get a fresh access token + headers = {'authorization': f'Bearer {REFRESH_TOKEN}'} + response = await context.send_request( + 'https://placeholder.org/refresh', headers=headers + ) + data = json.loads(await response.read()) + # Add the new token to our `Request` headers + context.request.headers |= HttpHeaders( + {'authorization': f'Bearer {data["access_token"]}'}, + ) + # Trigger a retry with our updated headers + raise HttpStatusCodeError('Unauthorized', 
status_code=UNAUTHORIZED_CODE) + + await crawler.run(['http://httpbingo.org/status/401']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/error_handling/disable_retry.py b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/disable_retry.py new file mode 100644 index 0000000000..8d98eff312 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/disable_retry.py @@ -0,0 +1,30 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import HttpStatusCodeError, SessionError + + +async def main() -> None: + crawler = HttpCrawler(max_request_retries=5) + + # Create a parsing error for demonstration + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + raise ValueError('Simulated parsing error') + + # This handler runs before any retry attempts + @crawler.error_handler + async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error(f'Failed request {context.request.url}') + # Only allow retries for network-related errors + if not isinstance(error, (SessionError, HttpStatusCodeError)): + context.log.error('Non-network error detected') + # Stop further retry attempts for this `Request` + context.request.no_retry = True + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/error_handling/handle_proxy_error.py b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/handle_proxy_error.py new file mode 100644 index 0000000000..eddb843fdd --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/error_handling/handle_proxy_error.py @@ -0,0 +1,40 @@ +import asyncio + +from crawlee import 
Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import ProxyError + + +async def main() -> None: + # Set how many session rotations will happen before calling the error handler + # when ProxyError occurs + crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6) + + # For this example, we'll create a proxy error in our handler + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + raise ProxyError('Simulated proxy error') + + # This handler runs after all retry attempts are exhausted + @crawler.failed_request_handler + async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error(f'Failed request {context.request.url}, after 5 rotations') + request = context.request + # For proxy errors, we can add a new `Request` to try again + if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'): + context.log.info(f'Retrying {request.url} ...') + # Create a new `Request` with a modified key to avoid deduplication + new_request = Request.from_url( + request.url, unique_key=f'retry{request.unique_key}' + ) + + # Add the new `Request` to the `Queue` + rq = await crawler.get_request_manager() + await rq.add_request(new_request) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_curl_impersonate_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_curl_impersonate_example.py new file mode 100644 index 0000000000..63030b93d7 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_curl_impersonate_example.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from 
crawlee.http_clients import CurlImpersonateHttpClient + + +async def main() -> None: + http_client = CurlImpersonateHttpClient( + # Optional additional keyword arguments for `curl_cffi.requests.AsyncSession`. + timeout=10, + impersonate='chrome131', + ) + + crawler = ParselCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.css('title::text').get(), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_httpx_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_httpx_example.py new file mode 100644 index 0000000000..8075a6d9ef --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_httpx_example.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import HttpxHttpClient + + +async def main() -> None: + http_client = HttpxHttpClient( + # Optional additional keyword arguments for `httpx.AsyncClient`. + timeout=10, + follow_redirects=True, + ) + + crawler = ParselCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. 
+ max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.css('title::text').get(), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_impit_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_impit_example.py new file mode 100644 index 0000000000..5cd90ce4a8 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_clients/parsel_impit_example.py @@ -0,0 +1,43 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import ImpitHttpClient + + +async def main() -> None: + http_client = ImpitHttpClient( + # Optional additional keyword arguments for `impit.AsyncClient`. + http3=True, + browser='firefox', + verify=True, + ) + + crawler = ParselCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. 
+ data = { + 'url': context.request.url, + 'title': context.selector.css('title::text').get(), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/__init__.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/beautifulsoup_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/beautifulsoup_example.py new file mode 100644 index 0000000000..49e6fde9ec --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/beautifulsoup_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create a BeautifulSoupCrawler instance + crawler = BeautifulSoupCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract data using BeautifulSoup + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Enqueue links found on the page for further crawling + await context.enqueue_links() + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/custom_crawler_example.py 
b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/custom_crawler_example.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/http_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/http_example.py new file mode 100644 index 0000000000..a426a2ee23 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/http_example.py @@ -0,0 +1,52 @@ +import asyncio +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Create an HttpCrawler instance - no automatic parsing + crawler = HttpCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Get the raw response content + response_body = await context.http_response.read() + response_text = response_body.decode('utf-8') + + # Extract title manually using regex (since we don't have a parser) + title_match = re.search( + r'<title[^>]*>([^<]+)', response_text, re.IGNORECASE + ) + title = title_match.group(1).strip() if title_match else None + + # Extract basic information + data = { + 'url': context.request.url, + 'title': title, + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Simple link extraction for further crawling + href_pattern = r'href=["\']([^"\']+)["\']' + matches = re.findall(href_pattern, response_text, re.IGNORECASE) + + # Enqueue first few links found (limit to avoid too many requests) + for href in matches[:3]: + if href.startswith('http') and 'crawlee.dev' in href: + await context.add_requests([href]) + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lexbor_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lexbor_parser.py new file mode 100644 index 0000000000..ef279793ed --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lexbor_parser.py @@ -0,0 +1,63 @@ +import asyncio + +from pydantic import ValidationError +from selectolax.lexbor import LexborHTMLParser +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Selectolax with Lexbor backend. + parsed_html = LexborHTMLParser(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.css_first('title').text(), + 'h1s': [h1.text() for h1 in parsed_html.css('h1')], + 'h2s': [h2.text() for h2 in parsed_html.css('h2')], + 'h3s': [h3.text() for h3 in parsed_html.css('h3')], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. + for item in parsed_html.css(links_selector): + href = item.attributes.get('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. 
+ url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_parser.py new file mode 100644 index 0000000000..b50fda4293 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_parser.py @@ -0,0 +1,61 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using lxml. + parsed_html = html.fromstring(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.findtext('.//title'), + 'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')], + 'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')], + 'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')], + } + await context.push_data(data) + + # Convert relative URLs to absolute before extracting links. + parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + + # Xpath 1.0 selector for extracting valid href attributes. 
+ links_xpath = ( + '//a/@href[not(starts-with(., "#")) ' + 'and not(starts-with(., "javascript:")) ' + 'and not(starts-with(., "mailto:"))]' + ) + + extracted_requests = [] + + # Extract links. + for url in parsed_html.xpath(links_xpath): + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_saxonche_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_saxonche_parser.py new file mode 100644 index 0000000000..ac839a6164 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/lxml_saxonche_parser.py @@ -0,0 +1,77 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError +from saxonche import PySaxonProcessor + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + # Create Saxon processor once and reuse across requests. + saxon_proc = PySaxonProcessor(license=False) + xpath_proc = saxon_proc.new_xpath_processor() + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse HTML with lxml. + parsed_html = html.fromstring(await context.http_response.read()) + # Convert relative URLs to absolute before extracting links. + parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + # Convert parsed HTML to XML for Saxon processing. 
+ xml = html.tostring(parsed_html, encoding='unicode', method='xml') + # Parse XML with Saxon. + parsed_xml = saxon_proc.parse_xml(xml_text=xml) + # Set the parsed context for XPath evaluation. + xpath_proc.set_context(xdm_item=parsed_xml) + + # Extract data using XPath 2.0 string() function. + data = { + 'url': context.request.url, + 'title': xpath_proc.evaluate_single('.//title/string()'), + 'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])], + 'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])], + 'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])], + } + await context.push_data(data) + + # XPath 2.0 with distinct-values() to get unique links and remove fragments. + links_xpath = """ + distinct-values( + for $href in //a/@href[ + not(starts-with(., "#")) + and not(starts-with(., "javascript:")) + and not(starts-with(., "mailto:")) + ] + return replace($href, "#.*$", "") + ) + """ + + extracted_requests = [] + + # Extract links. + for item in xpath_proc.evaluate(links_xpath) or []: + url = item.string_value + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/parsel_example.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/parsel_example.py new file mode 100644 index 0000000000..a368317ba6 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/parsel_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a ParselCrawler instance + crawler = ParselCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract data using Parsel's XPath and CSS selectors + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Enqueue links found on the page for further crawling + await context.enqueue_links() + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/pyquery_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/pyquery_parser.py new file mode 100644 index 0000000000..1e15e9cb5b --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/pyquery_parser.py @@ -0,0 +1,64 @@ +import asyncio + +from pydantic import ValidationError +from pyquery import PyQuery +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, 
HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using PyQuery. + parsed_html = PyQuery(await context.http_response.read()) + + # Extract data using jQuery-style selectors. + data = { + 'url': context.request.url, + 'title': parsed_html('title').text(), + 'h1s': [h1.text() for h1 in parsed_html('h1').items()], + 'h2s': [h2.text() for h2 in parsed_html('h2').items()], + 'h3s': [h3.text() for h3 in parsed_html('h3').items()], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + + extracted_requests = [] + + # Extract links. + for item in parsed_html(links_selector).items(): + href = item.attr('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(str(href)))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/scrapling_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/scrapling_parser.py new file mode 100644 index 0000000000..201b9b0cbf --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/scrapling_parser.py @@ -0,0 +1,74 @@ +import asyncio + +from pydantic import ValidationError +from scrapling.parser import Selector +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Scrapling. + page = Selector(await context.http_response.read(), url=context.request.url) + + # Extract data using Xpath selectors with .get_all_text method for full text + # content. + title_el = page.xpath_first('//title') + data = { + 'url': context.request.url, + 'title': title_el.text if isinstance(title_el, Selector) else title_el, + 'h1s': [ + h1.get_all_text() if isinstance(h1, Selector) else h1 + for h1 in page.xpath('//h1') + ], + 'h2s': [ + h2.get_all_text() if isinstance(h2, Selector) else h2 + for h2 in page.xpath('//h2') + ], + 'h3s': [ + h3.get_all_text() if isinstance(h3, Selector) else h3 + for h3 in page.xpath('//h3') + ], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. 
+ for item in page.css(links_selector): + href = item.attrib.get('href') if isinstance(item, Selector) else None + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_adaptive_run.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_adaptive_run.py new file mode 100644 index 0000000000..c554e8db64 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_adaptive_run.py @@ -0,0 +1,34 @@ +import asyncio + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, +) + +from .selectolax_parser import SelectolaxLexborParser + + +async def main() -> None: + crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( + max_requests_per_crawl=10, + # Use custom Selectolax parser for static content parsing. 
+ static_parser=SelectolaxLexborParser(), + ) + + @crawler.router.default_handler + async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + data = { + 'url': context.request.url, + 'title': await context.query_selector_one('title'), + } + + await context.push_data(data) + + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_context.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_context.py new file mode 100644 index 0000000000..3a34e20d8d --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_context.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass, fields + +from selectolax.lexbor import LexborHTMLParser +from typing_extensions import Self + +from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext + + +# Custom context for Selectolax parser, you can add your own methods here +# to facilitate working with the parsed document. +@dataclass(frozen=True) +class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): + """Crawling context providing access to the parsed page. + + This context is passed to request handlers and includes all standard + context methods (push_data, enqueue_links, etc.) plus custom helpers. + """ + + @property + def parser(self) -> LexborHTMLParser: + """Convenient alias for accessing the parsed document.""" + return self.parsed_content + + @classmethod + def from_parsed_http_crawling_context( + cls, context: ParsedHttpCrawlingContext[LexborHTMLParser] + ) -> Self: + """Create custom context from the base context. + + Copies all fields from the base context to preserve framework + functionality while adding custom interface. 
+ """ + return cls( + **{field.name: getattr(context, field.name) for field in fields(context)} + ) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler.py new file mode 100644 index 0000000000..677a6a3b00 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from selectolax.lexbor import LexborHTMLParser, LexborNode + +from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions + +from .selectolax_context import SelectolaxLexborContext +from .selectolax_parser import SelectolaxLexborParser + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from typing_extensions import Unpack + + from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext + + +# Custom crawler using custom context, It is optional and you can use +# AbstractHttpCrawler directly with SelectolaxLexborParser if you don't need +# any custom context methods. +class SelectolaxLexborCrawler( + AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode] +): + """Custom crawler using Selectolax Lexbor for HTML parsing.""" + + def __init__( + self, + **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]], + ) -> None: + # Final step converts the base context to custom context type. + async def final_step( + context: ParsedHttpCrawlingContext[LexborHTMLParser], + ) -> AsyncGenerator[SelectolaxLexborContext, None]: + # Yield custom context wrapping with additional functionality around the base + # context. + yield SelectolaxLexborContext.from_parsed_http_crawling_context(context) + + # Build context pipeline: HTTP request -> parsing -> custom context. 
+ kwargs['_context_pipeline'] = ( + self._create_static_content_crawler_pipeline().compose(final_step) + ) + super().__init__( + parser=SelectolaxLexborParser(), + **kwargs, + ) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler_run.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler_run.py new file mode 100644 index 0000000000..52c25ac4da --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_crawler_run.py @@ -0,0 +1,27 @@ +import asyncio + +from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler + + +async def main() -> None: + crawler = SelectolaxLexborCrawler( + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def handle_request(context: SelectolaxLexborContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.parser.css_first('title').text(), + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_parser.py b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_parser.py new file mode 100644 index 0000000000..b8fca8b38c --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/http_crawlers/selectolax_parser.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +from selectolax.lexbor import LexborHTMLParser, LexborNode +from typing_extensions import override + +from crawlee.crawlers._abstract_http import AbstractHttpParser + +if TYPE_CHECKING: + from collections.abc import Iterable, Sequence + + from crawlee.http_clients import HttpResponse + + +class 
SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]): + """Parser for parsing HTTP response using Selectolax Lexbor.""" + + @override + async def parse(self, response: HttpResponse) -> LexborHTMLParser: + """Parse HTTP response body into a document object.""" + response_body = await response.read() + # Run parsing in a thread to avoid blocking the event loop. + return await asyncio.to_thread(LexborHTMLParser, response_body) + + @override + async def parse_text(self, text: str) -> LexborHTMLParser: + """Parse raw HTML string into a document object.""" + return LexborHTMLParser(text) + + @override + async def select( + self, parsed_content: LexborHTMLParser, selector: str + ) -> Sequence[LexborNode]: + """Select elements matching a CSS selector.""" + return tuple(item for item in parsed_content.css(selector)) + + @override + def is_matching_selector( + self, parsed_content: LexborHTMLParser, selector: str + ) -> bool: + """Check if any element matches the selector.""" + return parsed_content.css_first(selector) is not None + + @override + def find_links( + self, parsed_content: LexborHTMLParser, selector: str, attribute: str + ) -> Iterable[str]: + """Extract href attributes from elements matching the selector. + + Used by `enqueue_links` helper to discover URLs. 
+ """ + link: LexborNode + urls: list[str] = [] + for link in parsed_content.css(selector): + url = link.attributes.get(attribute) + if url: + urls.append(url.strip()) + return urls diff --git a/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/http_login.py b/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/http_login.py new file mode 100644 index 0000000000..2b7cb6050a --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/http_login.py @@ -0,0 +1,85 @@ +import asyncio +import json +from datetime import datetime, timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import ( + HttpCrawler, + HttpCrawlingContext, +) +from crawlee.sessions import SessionPool + + +async def main() -> None: + crawler = HttpCrawler( + max_requests_per_crawl=10, + # Configure to use a single persistent session throughout the crawl + max_session_rotations=0, + # Limit request rate to avoid triggering anti-scraping measures + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), + session_pool=SessionPool( + max_pool_size=1, + create_session_settings={ + # Set high value to ensure the session isn't replaced during crawling + 'max_usage_count': 999_999, + # Set high value to prevent session expiration during crawling + 'max_age': timedelta(hours=999_999), + # Higher error tolerance before the session is considered blocked + # Make sure you implement proper error handling in your code + 'max_error_score': 100, + }, + ), + ) + + # Default request handler for normal page processing + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Specialized handler for the login API request + @crawler.router.handler('login') + async def login_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing login at {context.request.url} ...') + + # Verify that 
a session is available before proceeding + if not context.session: + raise RuntimeError('Session not found') + + # Parse the API response containing authentication tokens and user data + data = json.loads(await context.http_response.read()) + + # Extract authentication data from the response + token = data['token'] + expires = data['expires'].replace('Z', '+00:00') + expires_int = int(datetime.fromisoformat(expires).timestamp()) + user_id = data['userId'] + username = data['username'] + + # Set authentication cookies in the session that will be used + # for subsequent requests + context.session.cookies.set(name='token', value=token, expires=expires_int) + context.session.cookies.set(name='userID', value=user_id) + context.session.cookies.set(name='userName', value=username) + + # After successful authentication, continue crawling with the + # authenticated session + await context.add_requests(['https://demoqa.com/BookStore/v1/Books']) + + # Create a POST request to the authentication API endpoint + # This will trigger the login_handler when executed + request = Request.from_url( + 'https://demoqa.com/Account/v1/Login', + label='login', + method='POST', + payload=json.dumps( + {'userName': 'crawlee_test', 'password': 'Test1234!'} + ).encode(), + headers={'Content-Type': 'application/json'}, + ) + + # Start the crawling process with the login request + await crawler.run([request]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/playwright_login.py b/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/playwright_login.py new file mode 100644 index 0000000000..9530fc1e00 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/login_crawler/playwright_login.py @@ -0,0 +1,70 @@ +import asyncio +from datetime import timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import ( + PlaywrightCrawler, + 
PlaywrightCrawlingContext, +) +from crawlee.sessions import SessionPool + + +async def main() -> None: + crawler = PlaywrightCrawler( + max_requests_per_crawl=10, + headless=True, + browser_type='chromium', + # We only have one session and it shouldn't rotate + max_session_rotations=0, + # Limit crawling intensity to avoid blocking + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), + session_pool=SessionPool( + # Limit the pool to one session + max_pool_size=1, + create_session_settings={ + # High value for session usage limit + 'max_usage_count': 999_999, + # High value for session lifetime + 'max_age': timedelta(hours=999_999), + # High score allows the session to encounter more errors + # before crawlee decides the session is blocked + # Make sure you know how to handle these errors + 'max_error_score': 100, + }, + ), + ) + + # The main handler for processing requests + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # A handler for the login page + @crawler.router.handler('login') + async def login_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing login {context.request.url} ...') + + # Check if the session is available + if not context.session: + raise RuntimeError('Session not found') + + # Entering data into the form, `delay` to simulate human typing + # Without this, the data will be entered instantly + await context.page.type('#userName', 'crawlee_test', delay=100) + await context.page.type('#password', 'Test1234!', delay=100) + await context.page.click('#login', delay=100) + + # Wait for an element confirming that we have successfully + # logged in to the site + await context.page.locator('#userName-value').first.wait_for(state='visible') + context.log.info('Login successful!') + + # Moving on to the basic flow of crawling + await context.add_requests(['https://demoqa.com/books']) + + 
# We start crawling with login. This is necessary to access the rest of the pages + await crawler.run([Request.from_url('https://demoqa.com/login', label='login')]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_configuration_example.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_configuration_example.py new file mode 100644 index 0000000000..10ff84eba0 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_configuration_example.py @@ -0,0 +1,43 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + headless=False, + browser_type='chromium', + # Browser launch options + browser_launch_options={ + # For support `msedge` channel you need to install it + # `playwright install msedge` + 'channel': 'msedge', + 'slow_mo': 200, + }, + # Context launch options, applied to each page as it is created + browser_new_context_options={ + 'color_scheme': 'dark', + # Set headers + 'extra_http_headers': { + 'Custom-Header': 'my-header', + 'Accept-Language': 'en', + }, + # Set only User Agent + 'user_agent': 'My-User-Agent', + }, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py new file mode 100644 index 0000000000..08690d7bb4 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import asyncio +import logging +from typing import TYPE_CHECKING, Any + +from crawlee.browsers import BrowserPool +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import KeyValueStore + +if TYPE_CHECKING: + from crawlee.browsers._browser_controller import BrowserController + from crawlee.browsers._types import CrawleePage + from crawlee.proxy_configuration import ProxyInfo + +logger = logging.getLogger(__name__) + + +async def main() -> None: + async with BrowserPool() as browser_pool: + + @browser_pool.pre_page_create_hook + async def log_page_init( + page_id: str, + _browser_controller: BrowserController, + _browser_new_context_options: dict[str, Any], + _proxy_info: ProxyInfo | None, + ) -> None: + """Log when a new page is about to be created.""" + logger.info(f'Creating page {page_id}...') + + @browser_pool.post_page_create_hook + async def set_viewport( + crawlee_page: CrawleePage, _browser_controller: BrowserController + ) -> None: + """Set a fixed viewport size on each newly created page.""" + await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024}) + + @browser_pool.pre_page_close_hook + async def save_screenshot( + crawlee_page: CrawleePage, _browser_controller: BrowserController + ) -> None: + """Save a screenshot to KeyValueStore before each page is closed.""" + kvs = await KeyValueStore.open() + + screenshot = await 
crawlee_page.page.screenshot() + await kvs.set_value( + key=f'screenshot-{crawlee_page.id}', + value=screenshot, + content_type='image/png', + ) + logger.info(f'Saved screenshot for page {crawlee_page.id}.') + + @browser_pool.post_page_close_hook + async def log_page_closed( + page_id: str, _browser_controller: BrowserController + ) -> None: + """Log after each page is closed.""" + logger.info(f'Page {page_id} closed successfully.') + + crawler = PlaywrightCrawler( + browser_pool=browser_pool, + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/multiple_launch_example.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/multiple_launch_example.py new file mode 100644 index 0000000000..59219b89c7 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/multiple_launch_example.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Create a plugin for each required browser. + plugin_chromium = PlaywrightBrowserPlugin( + browser_type='chromium', max_open_pages_per_browser=1 + ) + plugin_firefox = PlaywrightBrowserPlugin( + browser_type='firefox', max_open_pages_per_browser=1 + ) + + crawler = PlaywrightCrawler( + browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]), + # Limit the crawl to max requests. Remove or increase it for crawling all links. 
+ max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + browser_name = ( + context.page.context.browser.browser_type.name + if context.page.context.browser + else 'undefined' + ) + context.log.info(f'Processing {context.request.url} with {browser_name} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev', 'https://apify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/navigation_hooks_example.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/navigation_hooks_example.py new file mode 100644 index 0000000000..6abfde0dfc --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/navigation_hooks_example.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, + PlaywrightPreNavCrawlingContext, +) +from crawlee.errors import SessionError + + +async def main() -> None: + crawler = PlaywrightCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + @crawler.pre_navigation_hook + async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + + # block stylesheets, images, fonts and other static assets + # to speed up page loading + await context.block_requests() + + @crawler.post_navigation_hook + async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None: + # check if the page contains a captcha + captcha_element = 
context.page.locator('input[name="captcha"]').first + if await captcha_element.is_visible(): + context.log.warning('Captcha detected! Skipping the page.') + raise SessionError('Captcha detected') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py new file mode 100644 index 0000000000..6db2fb589d --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler + + +async def main() -> None: + crawler = PlaywrightCrawler( + browser_pool=BrowserPool( + plugins=[ + PlaywrightBrowserPlugin( + browser_type='chromium', + browser_launch_options={ + 'headless': False, + 'channel': 'msedge', + 'slow_mo': 200, + }, + browser_new_context_options={ + 'color_scheme': 'dark', + 'extra_http_headers': { + 'Custom-Header': 'my-header', + 'Accept-Language': 'en', + }, + 'user_agent': 'My-User-Agent', + }, + ) + ] + ) + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/handler.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/handler.py new file mode 100644 index 0000000000..629b49449e --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/handler.py @@ -0,0 +1,21 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() + + @crawler.router.default_handler + async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + # Locate element h2 within 5 seconds + h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) + # Do stuff with element found by the selector + context.log.info(h2) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py new file mode 100644 index 0000000000..c0008d3a29 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.crawlers import AdaptivePlaywrightCrawler + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={ + 'headless': False, + 'browser_type': 'chromium', + }, + # Common arguments relevant to all crawlers + max_crawl_depth=5, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py new file mode 100644 index 0000000000..c220d53be4 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_parsel.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.crawlers import AdaptivePlaywrightCrawler + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + # Arguments relevant only for PlaywrightCrawler + playwright_crawler_specific_kwargs={ + 'headless': False, + 'browser_type': 'chromium', + }, + # Common arguments relevant to all crawlers + max_crawl_depth=5, + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py new file mode 100644 index 0000000000..a8409d6150 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/init_prediction.py @@ -0,0 +1,72 @@ +import asyncio + +from crawlee import Request +from crawlee._types import RequestHandlerRunResult +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + RenderingType, + RenderingTypePrediction, + RenderingTypePredictor, +) + + +class CustomRenderingTypePredictor(RenderingTypePredictor): + def __init__(self) -> None: + super().__init__() + + self._learning_data = list[tuple[Request, RenderingType]]() + + def predict(self, request: Request) -> RenderingTypePrediction: + # Some custom logic that produces some `RenderingTypePrediction` + # based on the `request` input. 
+ rendering_type: RenderingType = ( + 'static' if 'abc' in request.url else 'client only' + ) + + return RenderingTypePrediction( + # Recommends `static` rendering type -> HTTP-based sub crawler will be used. + rendering_type=rendering_type, + # Recommends that both sub crawlers should run with 20% chance. When both sub + # crawlers are running, the predictor can compare results and learn. + # High number means that predictor is not very confident about the + # `rendering_type`, low number means that predictor is very confident. + detection_probability_recommendation=0.2, + ) + + def store_result(self, request: Request, rendering_type: RenderingType) -> None: + # This function allows predictor to store new learning data and retrain itself + # if needed. `request` is input for prediction and `rendering_type` is the correct + # prediction. + self._learning_data.append((request, rendering_type)) + # retrain + + +def result_checker(result: RequestHandlerRunResult) -> bool: + # Some function that inspects produced `result` and returns `True` if the result + # is correct. + return bool(result) # Check something on result + + +def result_comparator( + result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult +) -> bool: + # Some function that inspects two results and returns `True` if they are + # considered equivalent. It is used when comparing results produced by HTTP-based + # sub crawler and playwright based sub crawler. + return ( + result_1.push_data_calls == result_2.push_data_calls + ) # For example compare `push_data` calls. + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + rendering_type_predictor=CustomRenderingTypePredictor(), + result_checker=result_checker, + result_comparator=result_comparator, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py new file mode 100644 index 0000000000..bd95bd9f8b --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py @@ -0,0 +1,39 @@ +import asyncio + +from playwright.async_api import Route + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() + + @crawler.pre_navigation_hook + async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed both in static sub crawler and playwright sub crawler. + + Trying to access `context.page` in this hook would raise `AdaptiveContextError` + for pages crawled without playwright. 
+ """ + context.log.info(f'pre navigation hook for: {context.request.url}') + + @crawler.pre_navigation_hook(playwright_only=True) + async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + """Hook executed only in playwright sub crawler.""" + + async def some_routing_function(route: Route) -> None: + await route.continue_() + + await context.page.route('*/**', some_routing_function) + context.log.info( + f'Playwright only pre navigation hook for: {context.request.url}' + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/__init__.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/browser_classes.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/browser_classes.py new file mode 100644 index 0000000000..67b76f1f47 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/browser_classes.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any, cast + +from stagehand.context import StagehandContext +from typing_extensions import override + +from crawlee.browsers import ( + PlaywrightBrowserController, + PlaywrightBrowserPlugin, + PlaywrightPersistentBrowser, +) + +from .support_classes import CrawleeStagehandPage + +if TYPE_CHECKING: + from collections.abc import Mapping + + from playwright.async_api import Page + from stagehand import Stagehand + + from crawlee.proxy_configuration import ProxyInfo + + +class StagehandBrowserController(PlaywrightBrowserController): + @override + def __init__( + self, browser: 
PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any + ) -> None: + # Initialize with browser context instead of browser instance + super().__init__(browser, **kwargs) + + self._stagehand = stagehand + self._stagehand_context: StagehandContext | None = None + + @override + async def new_page( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> Page: + # Initialize browser context if not already done + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + + # Initialize Stagehand context if not already done + if not self._stagehand_context: + self._stagehand_context = await StagehandContext.init( + self._browser_context, self._stagehand + ) + + # Create a new page using Stagehand context + page = await self._stagehand_context.new_page() + + pw_page = page._page # noqa: SLF001 + + # Handle page close event + pw_page.on(event='close', f=self._on_page_close) + + # Update internal state + self._pages.append(pw_page) + self._last_page_opened_at = datetime.now(timezone.utc) + + self._total_opened_pages += 1 + + # Wrap StagehandPage to provide Playwright Page interface + return cast('Page', CrawleeStagehandPage(page)) + + +class StagehandPlugin(PlaywrightBrowserPlugin): + """Browser plugin that integrates Stagehand with Crawlee's browser management.""" + + @override + def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None: + super().__init__(**kwargs) + + self._stagehand = stagehand + + @override + async def new_browser(self) -> StagehandBrowserController: + if not self._playwright: + raise RuntimeError('Playwright browser plugin is not initialized.') + + browser = PlaywrightPersistentBrowser( + # Stagehand can run only on a Chromium-based browser. 
+ self._playwright.chromium, + self._user_data_dir, + self._browser_launch_options, + ) + + # Return custom controller with Stagehand + return StagehandBrowserController( + browser=browser, + stagehand=self._stagehand, + header_generator=None, + fingerprint_generator=None, + ) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py new file mode 100644 index 0000000000..6cf8cc2689 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import asyncio +import os +from typing import cast + +from stagehand import StagehandConfig, StagehandPage + +from crawlee import ConcurrencySettings +from crawlee.browsers import BrowserPool +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from .browser_classes import StagehandPlugin +from .support_classes import CrawleeStagehand + + +async def main() -> None: + # Configure local Stagehand with Gemini model + config = StagehandConfig( + env='LOCAL', + model_name='google/gemini-2.5-flash-preview-05-20', + model_api_key=os.getenv('GEMINI_API_KEY'), + ) + + # Create Stagehand instance + stagehand = CrawleeStagehand(config) + + # Create crawler with custom browser pool using Stagehand + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Custom browser pool. Gives users full control over browsers used by the crawler. + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10), + browser_pool=BrowserPool( + plugins=[ + StagehandPlugin(stagehand, browser_launch_options={'headless': True}) + ], + ), + ) + + # Define the default request handler, which will be called for every request. 
+ @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Cast to StagehandPage for proper type hints in IDE + page = cast('StagehandPage', context.page) + + # Use regular Playwright method + playwright_title = await page.title() + context.log.info(f'Playwright page title: {playwright_title}') + + # highlight-start + # Use AI-powered extraction with natural language + gemini_title = await page.extract('Extract page title') + context.log.info(f'Gemini page title: {gemini_title}') + # highlight-end + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/support_classes.py b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/support_classes.py new file mode 100644 index 0000000000..cccb62e989 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/playwright_crawler_stagehand/support_classes.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from stagehand import Stagehand, StagehandPage + +if TYPE_CHECKING: + from types import TracebackType + + +class CrawleeStagehandPage: + """StagehandPage wrapper for Crawlee.""" + + def __init__(self, page: StagehandPage) -> None: + self._page = page + + async def goto( + self, + url: str, + *, + referer: str | None = None, + timeout: int | None = None, + wait_until: str | None = None, + ) -> Any: + """Navigate to the specified URL.""" + # Override goto to return navigation result that `PlaywrightCrawler` expects + return await self._page._page.goto( # noqa: SLF001 + url, + referer=referer, + timeout=timeout, + wait_until=wait_until, + ) + + def __getattr__(self, name: str) -> Any: + 
"""Delegate all other methods to the underlying StagehandPage.""" + return getattr(self._page, name) + + async def __aenter__(self) -> CrawleeStagehandPage: + """Enter the context manager.""" + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + await self._page.close() + + +class CrawleeStagehand(Stagehand): + """Stagehand wrapper for Crawlee to disable the launch of Playwright.""" + + async def init(self) -> None: + # Skip Stagehand's own Playwright initialization + # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle + self._initialized = True diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_bs_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_bs_example.py new file mode 100644 index 0000000000..b6035097d6 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_bs_example.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_pw_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_pw_example.py new file mode 100644 index 0000000000..e193972399 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/inspecting_pw_example.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_bs_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_bs_example.py new file mode 100644 index 0000000000..63a2e703e3 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_bs_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + context.log.info(f'Extracted data: {data}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_pw_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_pw_example.py new file mode 100644 index 0000000000..e8e0ea8821 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/integration_pw_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + } + context.log.info(f'Extracted data: {data}') + + # Run the crawler with the initial list of requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/quick_start_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/quick_start_example.py new file mode 100644 index 0000000000..3b43a48312 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/quick_start_example.py @@ -0,0 +1,21 @@ +import asyncio + +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + + # The proxy URLs are rotated in a round-robin. + proxy_url_1 = await proxy_configuration.new_url() # http://proxy-1.com/ + proxy_url_2 = await proxy_configuration.new_url() # http://proxy-2.com/ + proxy_url_3 = await proxy_configuration.new_url() # http://proxy-1.com/ + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_bs_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_bs_example.py new file mode 100644 index 0000000000..1243b0e488 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_bs_example.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + use_session_pool=True, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_pw_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_pw_example.py new file mode 100644 index 0000000000..68309bda59 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/session_pw_example.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + proxy_urls=[ + 'http://proxy-1.com/', + 'http://proxy-2.com/', + ] + ) + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + use_session_pool=True, + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_bs_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_bs_example.py new file mode 100644 index 0000000000..37f69e6419 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_bs_example.py @@ -0,0 +1,39 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + tiered_proxy_urls=[ + # No proxy tier. + # Optional in case you do not want to use any proxy on lowest tier. 
+ [None], + # lower tier, cheaper, preferred as long as they work + [ + 'http://cheap-datacenter-proxy-1.com/', + 'http://cheap-datacenter-proxy-2.com/', + ], + # higher tier, more expensive, used as a fallback + [ + 'http://expensive-residential-proxy-1.com/', + 'http://expensive-residential-proxy-2.com/', + ], + ] + ) + crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_pw_example.py b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_pw_example.py new file mode 100644 index 0000000000..2dcb5ad3bd --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/proxy_management/tiers_pw_example.py @@ -0,0 +1,39 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration + + +async def main() -> None: + # Create a ProxyConfiguration object and pass it to the crawler. + proxy_configuration = ProxyConfiguration( + tiered_proxy_urls=[ + # No proxy tier. + # Optional in case you do not want to use any proxy on lowest tier. 
+ [None], + # lower tier, cheaper, preferred as long as they work + [ + 'http://cheap-datacenter-proxy-1.com/', + 'http://cheap-datacenter-proxy-2.com/', + ], + # higher tier, more expensive, used as a fallback + [ + 'http://expensive-residential-proxy-1.com/', + 'http://expensive-residential-proxy-2.com/', + ], + ] + ) + crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Log the proxy used for the current request. + context.log.info(f'Proxy for the current request: {context.proxy_info}') + + # Run the crawler with the initial list of requests. + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example.py new file mode 100644 index 0000000000..3403673382 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.request_loaders import RequestList + + +async def main() -> None: + # Open the request list, if it does not exist, it will be created. + # Leave name empty to use the default request list. + request_list = RequestList( + name='my-request-list', + requests=[ + 'https://apify.com/', + 'https://crawlee.dev/', + 'https://crawlee.dev/python/', + ], + ) + + # Fetch and process requests from the queue. + while request := await request_list.fetch_next_request(): + # Do something with it... + print(f'Processing {request.url}') + + # And mark it as handled. 
+ await request_list.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example_with_persist.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example_with_persist.py new file mode 100644 index 0000000000..a3d2f89304 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_basic_example_with_persist.py @@ -0,0 +1,46 @@ +import asyncio +import logging + +from crawlee import service_locator +from crawlee.request_loaders import RequestList + +logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s') +logger = logging.getLogger(__name__) + + +# Disable clearing the `KeyValueStore` on each run. +# This is necessary so that the state keys are not cleared at startup. +# The recommended way to achieve this behavior is setting the environment variable +# `CRAWLEE_PURGE_ON_START=0` +configuration = service_locator.get_configuration() +configuration.purge_on_start = False + + +async def main() -> None: + # Open the request list, if it does not exist, it will be created. + # Leave name empty to use the default request list. + request_list = RequestList( + name='my-request-list', + requests=[ + 'https://apify.com/', + 'https://crawlee.dev/', + 'https://crawlee.dev/python/', + ], + # Enable persistence + persist_state_key='my-persist-state', + persist_requests_key='my-persist-requests', + ) + + # We receive only one request. + # Each time you run it, it will be a new request until you exhaust the `RequestList`. + request = await request_list.fetch_next_request() + if request: + logger.info(f'Processing request: {request.url}') + # Do something with it... + + # And mark it as handled. 
+ await request_list.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example.py new file mode 100644 index 0000000000..d71345b420 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example.py @@ -0,0 +1,43 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.request_loaders import RequestList + + +async def main() -> None: + # Create a static request list. + request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) + + # highlight-start + # Convert the request list to a request manager using the to_tandem method. + # It is a tandem with the default request queue. + request_manager = await request_list.to_tandem() + # highlight-end + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + # Extract data using Parsel's XPath and CSS selectors. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset. 
+ await context.push_data(data) + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example_explicit.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example_explicit.py new file mode 100644 index 0000000000..f3397b7043 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/rl_tandem_example_explicit.py @@ -0,0 +1,44 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.request_loaders import RequestList, RequestManagerTandem +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Create a static request list. + request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) + + # Open the default request queue. + request_queue = await RequestQueue.open() + + # And combine them together to a sinhle request manager. + request_manager = RequestManagerTandem(request_list, request_queue) + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + # Extract data using Parsel's XPath and CSS selectors. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset. 
+ await context.push_data(data) + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_basic_example.py new file mode 100644 index 0000000000..07beff458f --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_basic_example.py @@ -0,0 +1,30 @@ +import asyncio +import re + +from crawlee.http_clients import ImpitHttpClient +from crawlee.request_loaders import SitemapRequestLoader + + +async def main() -> None: + # Create an HTTP client for fetching the sitemap. + http_client = ImpitHttpClient() + + # Create a sitemap request loader with filtering rules. + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. + max_buffer_size=500, # Keep up to 500 URLs in memory before processing. + ) + + # We work with the loader until we process all relevant links from the sitemap. + while request := await sitemap_loader.fetch_next_request(): + # Do something with it... + print(f'Processing {request.url}') + + # And mark it as handled. 
+ await sitemap_loader.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_example_with_persist.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_example_with_persist.py new file mode 100644 index 0000000000..3b7f3b456d --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_example_with_persist.py @@ -0,0 +1,45 @@ +import asyncio +import logging + +from crawlee import service_locator +from crawlee.http_clients import ImpitHttpClient +from crawlee.request_loaders import SitemapRequestLoader + +logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s') +logger = logging.getLogger(__name__) + + +# Disable clearing the `KeyValueStore` on each run. +# This is necessary so that the state keys are not cleared at startup. +# The recommended way to achieve this behavior is setting the environment variable +# `CRAWLEE_PURGE_ON_START=0` +configuration = service_locator.get_configuration() +configuration.purge_on_start = False + + +async def main() -> None: + # Create an HTTP client for fetching sitemaps + # Use the context manager for `SitemapRequestLoader` to correctly save the state when + # the work is completed. + async with ( + ImpitHttpClient() as http_client, + SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + # Enable persistence + persist_state_key='my-persist-state', + ) as sitemap_loader, + ): + # We receive only one request. + # Each time you run it, it will be a new request until you exhaust the sitemap. + request = await sitemap_loader.fetch_next_request() + if request: + logger.info(f'Processing request: {request.url}') + # Do something with it... + + # And mark it as handled. 
+ await sitemap_loader.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example.py new file mode 100644 index 0000000000..bf5fc012b4 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example.py @@ -0,0 +1,53 @@ +import asyncio +import re + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import ImpitHttpClient +from crawlee.request_loaders import SitemapRequestLoader + + +async def main() -> None: + # Create an HTTP client for fetching the sitemap. + http_client = ImpitHttpClient() + + # Create a sitemap request loader with filtering rules. + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. + max_buffer_size=500, # Keep up to 500 URLs in memory before processing. + ) + + # highlight-start + # Convert the sitemap loader into a request manager linked + # to the default request queue. + request_manager = await sitemap_loader.to_tandem() + # highlight-end + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + # Extract data using Parsel's XPath and CSS selectors. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset. 
+ await context.push_data(data) + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py new file mode 100644 index 0000000000..4b121c9b40 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py @@ -0,0 +1,54 @@ +import asyncio +import re + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import ImpitHttpClient +from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Create an HTTP client for fetching the sitemap. + http_client = ImpitHttpClient() + + # Create a sitemap request loader with filtering rules. + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. + max_buffer_size=500, # Keep up to 500 URLs in memory before processing. + ) + + # Open the default request queue. + request_queue = await RequestQueue.open() + + # And combine them together to a single request manager. + request_manager = RequestManagerTandem(sitemap_loader, request_queue) + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + # Extract data using Parsel's XPath and CSS selectors. 
+ data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset. + await context.push_data(data) + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/adaptive_crawler_handlers.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/adaptive_crawler_handlers.py new file mode 100644 index 0000000000..4814730df6 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -0,0 +1,51 @@ +import asyncio + +from crawlee import HttpHeaders +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.pre_navigation_hook + async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + # Common pre-navigation hook - runs for both HTTP and browser requests. + context.request.headers |= HttpHeaders( + {'Accept': 'text/html,application/xhtml+xml'}, + ) + + @crawler.pre_navigation_hook(playwright_only=True) + async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + # Playwright-specific pre-navigation hook - runs only when browser is used. + await context.page.set_viewport_size({'width': 1280, 'height': 720}) + if context.block_requests: + await context.block_requests(extra_url_patterns=['*.css', '*.js']) + + @crawler.router.default_handler + async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + # Extract title using the unified context interface. 
+ title_tag = context.parsed_content.find('title') + title = title_tag.get_text() if title_tag else None + + # Extract other data consistently across both modes. + links = [a.get('href') for a in context.parsed_content.find_all('a', href=True)] + + await context.push_data( + { + 'url': context.request.url, + 'title': title, + 'links': links, + } + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/basic_request_handlers.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/basic_request_handlers.py new file mode 100644 index 0000000000..ef88714876 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/basic_request_handlers.py @@ -0,0 +1,92 @@ +import asyncio + +from crawlee import Request +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router + + +async def main() -> None: + # Create a custom router instance + router = Router[ParselCrawlingContext]() + + # Define the default handler (fallback for requests without specific labels) + @router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing homepage: {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + await context.push_data( + { + 'url': context.request.url, + 'title': title, + 'page_type': 'homepage', + } + ) + + # Find and enqueue collection/category links + await context.enqueue_links(selector='a[href*="/collections/"]', label='CATEGORY') + + # Define a handler for category pages + @router.handler('CATEGORY') + async def category_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing category page: {context.request.url}') + + # Extract category information + category_title = 
context.selector.css('h1::text').get() or 'Unknown Category' + product_count = len(context.selector.css('.product-item').getall()) + + await context.push_data( + { + 'url': context.request.url, + 'type': 'category', + 'category_title': category_title, + 'product_count': product_count, + 'handler': 'category', + } + ) + + # Enqueue product links from this category + await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') + + # Define a handler for product detail pages + @router.handler('PRODUCT') + async def product_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing product page: {context.request.url}') + + # Extract detailed product information + product_data = { + 'url': context.request.url, + 'name': context.selector.css('h1::text').get(), + 'price': context.selector.css('.price::text').get(), + 'description': context.selector.css('.product-description p::text').get(), + 'images': context.selector.css('.product-gallery img::attr(src)').getall(), + 'in_stock': bool(context.selector.css('.add-to-cart-button').get()), + 'handler': 'product', + } + + await context.push_data(product_data) + + # Create crawler with the router + crawler = ParselCrawler( + request_handler=router, + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) + + # Start crawling with some initial requests + await crawler.run( + [ + # Will use default handler + 'https://warehouse-theme-metal.myshopify.com/', + # Will use category handler + Request.from_url( + 'https://warehouse-theme-metal.myshopify.com/collections/all', + label='CATEGORY', + ), + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/custom_router_default_only.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/custom_router_default_only.py new file mode 100644 index 0000000000..d6768d5777 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/custom_router_default_only.py @@ -0,0 +1,44 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router + + +async def main() -> None: + # Create a custom router instance + router = Router[ParselCrawlingContext]() + + # Define only a default handler + @router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + # Extract and save basic page data + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + # Find and enqueue product links for further crawling + await context.enqueue_links( + selector='a[href*="/products/"]', + label='PRODUCT', # Note: no handler for this label, will use default + ) + + # Create crawler with the custom router + crawler = ParselCrawler( + request_handler=router, + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) + + # Start crawling + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/error_handler.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/error_handler.py new file mode 100644 index 0000000000..b240e72eca --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/error_handler.py @@ -0,0 +1,62 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext +from crawlee.errors import HttpStatusCodeError + +# HTTP status code constants +TOO_MANY_REQUESTS = 429 + + +async def main() -> None: + # Create a crawler instance + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract product information (might fail for some pages) + product_name = context.selector.css('h1[data-testid="product-title"]::text').get() + if not product_name: + raise ValueError('Product name not found - might be a non-product page') + + price = context.selector.css('.price::text').get() + await context.push_data( + { + 'url': context.request.url, + 'product_name': product_name, + 'price': price, + } + ) + + # Error handler - called when an error occurs during request processing + @crawler.error_handler + async def error_handler(context: BasicCrawlingContext, error: Exception) -> None: + error_name = type(error).__name__ + context.log.warning(f'Error occurred for {context.request.url}: {error_name}') + + # You can modify the request or context here before retry + if ( + isinstance(error, HttpStatusCodeError) + and error.status_code == TOO_MANY_REQUESTS + ): + context.log.info('Rate limited - will retry with delay') + # You could 
modify headers, add delay, etc. + elif isinstance(error, ValueError): + context.log.info('Parse error - marking request as no retry') + context.request.no_retry = True + + # Start crawling + await crawler.run( + [ + 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', + # Might cause parse error + 'https://warehouse-theme-metal.myshopify.com/collections/mens-running', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/failed_request_handler.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/failed_request_handler.py new file mode 100644 index 0000000000..e09940b990 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/failed_request_handler.py @@ -0,0 +1,64 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a crawler instance with retry settings + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ max_request_retries=2, # Allow 2 retries before failing + ) + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract product information + product_name = context.selector.css('h1[data-testid="product-title"]::text').get() + if not product_name: + product_name = context.selector.css('h1::text').get() or 'Unknown Product' + + price = context.selector.css('.price::text').get() or 'Price not available' + + await context.push_data( + { + 'url': context.request.url, + 'product_name': product_name, + 'price': price, + 'status': 'success', + } + ) + + # Failed request handler - called when request has exhausted all retries + @crawler.failed_request_handler + async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error( + f'Failed to process {context.request.url} after all retries: {error}' + ) + + # Save failed request information for analysis + await context.push_data( + { + 'failed_url': context.request.url, + 'label': context.request.label, + 'error_type': type(error).__name__, + 'error_message': str(error), + 'retry_count': context.request.retry_count, + 'status': 'failed', + } + ) + + # Start crawling with some URLs that might fail + await crawler.run( + [ + 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', + # This will likely fail + 'https://warehouse-theme-metal.myshopify.com/invalid-url', + 'https://warehouse-theme-metal.myshopify.com/products/valid-product', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/http_pre_navigation.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/http_pre_navigation.py new file mode 100644 index 0000000000..84926f6fe4 --- /dev/null +++ 
b/website/versioned_docs/version-1.6/guides/code_examples/request_router/http_pre_navigation.py @@ -0,0 +1,37 @@ +import asyncio + +from crawlee import HttpHeaders +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) + + @crawler.pre_navigation_hook + async def setup_request(context: BasicCrawlingContext) -> None: + # Add custom headers before making the request + context.request.headers |= HttpHeaders( + { + 'User-Agent': 'Crawlee Bot 1.0', + 'Accept': 'text/html,application/xhtml+xml', + }, + ) + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + # Extract basic page information + title = context.selector.css('title::text').get() + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/playwright_pre_navigation.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/playwright_pre_navigation.py new file mode 100644 index 0000000000..aab49717ee --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/playwright_pre_navigation.py @@ -0,0 +1,59 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) + + @crawler.pre_navigation_hook + async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None: + # Set viewport size for consistent rendering + await context.page.set_viewport_size({'width': 1280, 'height': 720}) + + # Block unnecessary resources to speed up crawling + await context.block_requests( + extra_url_patterns=[ + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.svg', + '*.css', + '*.woff', + '*.woff2', + '*.ttf', + '*google-analytics*', + '*facebook*', + '*twitter*', + ] + ) + + # Set custom user agent + await context.page.set_extra_http_headers( + { + 'User-Agent': 'Mozilla/5.0 (compatible; Crawlee Bot)', + } + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + title = await context.page.title() + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/request_router/simple_default_handler.py b/website/versioned_docs/version-1.6/guides/code_examples/request_router/simple_default_handler.py new file mode 100644 index 0000000000..92c35651a1 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/request_router/simple_default_handler.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a crawler instance + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) + + # Use the crawler's built-in router to define a default handler + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + # Extract and save basic page data + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + # Find and enqueue product links for further crawling + await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') + + # Start crawling + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/__init__.py b/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/crawler.py b/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/crawler.py new file mode 100644 index 0000000000..37c6671856 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/crawler.py @@ -0,0 +1,54 @@ +import asyncio +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from typing import TypedDict + +from fastapi import FastAPI + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +class State(TypedDict): + """State available in the app.""" + + crawler: ParselCrawler + requests_to_results: dict[str, asyncio.Future[dict[str, str]]] + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[State]: + # Start up code that runs once when the app starts + + # Results will be stored in this dictionary + requests_to_results = dict[str, asyncio.Future[dict[str, str]]]() + + 
crawler = ParselCrawler( + # Keep the crawler alive even when there are no more requests to process now. + # This makes the crawler wait for more requests to be added later. + keep_alive=True + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() or '' + + # Extract data from the page and save it to the result dictionary. + requests_to_results[context.request.unique_key].set_result( + { + 'title': title, + } + ) + + # Start the crawler without awaiting it to finish + crawler.log.info(f'Starting crawler for the {app.title}') + run_task = asyncio.create_task(crawler.run([])) + + # Make the crawler and the result dictionary available in the app state + yield {'crawler': crawler, 'requests_to_results': requests_to_results} + + # Cleanup code that runs once when the app shuts down + crawler.stop() + # Wait for the crawler to finish + await run_task diff --git a/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/server.py b/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/server.py new file mode 100644 index 0000000000..64e192af37 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/running_in_web_server/server.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +from fastapi import FastAPI +from starlette.requests import Request +from starlette.responses import HTMLResponse + +import crawlee + +from .crawler import lifespan + +app = FastAPI(lifespan=lifespan, title='Crawler app') + + +@app.get('/', response_class=HTMLResponse) +def index() -> str: + return """ + + + +

Scraper server

+

To scrape some page, visit "scrape" endpoint with url parameter. + For example: + + /scrape?url=https://www.example.com + +

+ + +""" + + +@app.get('/scrape') +async def scrape_url(request: Request, url: str | None = None) -> dict: + if not url: + return {'url': 'missing', 'scrape result': 'no results'} + + # Generate random unique key for the request + unique_key = str(uuid4()) + + # Set the result future in the result dictionary so that it can be awaited + request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]() + + # Add the request to the crawler queue + await request.state.crawler.add_requests( + [crawlee.Request.from_url(url, unique_key=unique_key)] + ) + + # Wait for the result future to be finished + result = await request.state.requests_to_results[unique_key] + + # Clean the result from the result dictionary to free up memory + request.state.requests_to_results.pop(unique_key) + + # Return the result + return {'url': url, 'scrape result': result} diff --git a/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py b/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py new file mode 100644 index 0000000000..cbc1130bc7 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee import ConcurrencySettings +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + concurrency_settings = ConcurrencySettings( + # Set the maximum number of concurrent requests the crawler can run to 100. + max_concurrency=100, + # Limit the total number of requests to 10 per minute to avoid overwhelming + # the target website. + max_tasks_per_minute=10, + ) + + crawler = BeautifulSoupCrawler( + # Apply the defined concurrency settings to the crawler. + concurrency_settings=concurrency_settings, + ) + + # ... 
+ + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py b/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py new file mode 100644 index 0000000000..4d491446d0 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee import ConcurrencySettings +from crawlee.crawlers import BeautifulSoupCrawler + + +async def main() -> None: + concurrency_settings = ConcurrencySettings( + # Start with 8 concurrent tasks, as long as resources are available. + desired_concurrency=8, + # Maintain a minimum of 5 concurrent tasks to ensure steady crawling. + min_concurrency=5, + # Limit the maximum number of concurrent tasks to 10 to prevent + # overloading the system. + max_concurrency=10, + ) + + crawler = BeautifulSoupCrawler( + # Use the configured concurrency settings for the crawler. + concurrency_settings=concurrency_settings, + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_conflicts.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_conflicts.py new file mode 100644 index 0000000000..52bcbbe8e9 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_conflicts.py @@ -0,0 +1,22 @@ +import asyncio + +from crawlee import service_locator +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient + + +async def main() -> None: + # Register the storage client via service locator. + memory_storage_client = MemoryStorageClient() + service_locator.set_storage_client(memory_storage_client) + + # Retrieve the storage client. 
+ current_storage_client = service_locator.get_storage_client() + + # Try to set a different storage client, which will raise ServiceConflictError + # if storage client was already retrieved. + file_system_storage_client = FileSystemStorageClient() + service_locator.set_storage_client(file_system_storage_client) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_configuration.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_configuration.py new file mode 100644 index 0000000000..50b13fee71 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_configuration.py @@ -0,0 +1,22 @@ +import asyncio +from datetime import timedelta + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + + # Register configuration via crawler. + crawler = ParselCrawler( + configuration=configuration, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_event_manager.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_event_manager.py new file mode 100644 index 0000000000..e8a82f4f0e --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_event_manager.py @@ -0,0 +1,20 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import ParselCrawler +from crawlee.events import LocalEventManager + + +async def main() -> None: + event_manager = LocalEventManager( + system_info_interval=timedelta(seconds=5), + ) + + # Register event manager via crawler. 
+ crawler = ParselCrawler( + event_manager=event_manager, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_storage_client.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_storage_client.py new file mode 100644 index 0000000000..76fe923877 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_crawler_storage_client.py @@ -0,0 +1,17 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Register storage client via crawler. + crawler = ParselCrawler( + storage_client=storage_client, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_configuration.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_configuration.py new file mode 100644 index 0000000000..bb3f429eed --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_configuration.py @@ -0,0 +1,20 @@ +import asyncio +from datetime import timedelta + +from crawlee import service_locator +from crawlee.configuration import Configuration + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + + # Register configuration via service locator. 
+ service_locator.set_configuration(configuration) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_event_manager.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_event_manager.py new file mode 100644 index 0000000000..3d98a8cf55 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_event_manager.py @@ -0,0 +1,18 @@ +import asyncio +from datetime import timedelta + +from crawlee import service_locator +from crawlee.events import LocalEventManager + + +async def main() -> None: + event_manager = LocalEventManager( + system_info_interval=timedelta(seconds=5), + ) + + # Register event manager via service locator. + service_locator.set_event_manager(event_manager) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_storage_client.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_storage_client.py new file mode 100644 index 0000000000..4dcad08420 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_locator_storage_client.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee import service_locator +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Register storage client via service locator. 
+ service_locator.set_storage_client(storage_client) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_configuration.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_configuration.py new file mode 100644 index 0000000000..580e6d348f --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_configuration.py @@ -0,0 +1,30 @@ +import asyncio +from datetime import timedelta + +from crawlee import service_locator +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + # Set the custom configuration as the global default configuration. + service_locator.set_configuration(configuration) + + # Use the global defaults when creating the dataset (or other storage). + dataset_1 = await Dataset.open() + + # Or set explicitly specific configuration if + # you do not want to rely on global defaults. 
+ dataset_2 = await Dataset.open( + storage_client=MemoryStorageClient(), configuration=configuration + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_storage_client.py b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_storage_client.py new file mode 100644 index 0000000000..02a0853d44 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/service_locator/service_storage_storage_client.py @@ -0,0 +1,17 @@ +import asyncio + +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Pass the storage client to the dataset (or other storage) when opening it. + dataset = await Dataset.open( + storage_client=storage_client, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/multi_sessions_http.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/multi_sessions_http.py new file mode 100644 index 0000000000..0bd4a88beb --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/multi_sessions_http.py @@ -0,0 +1,85 @@ +import asyncio +from collections.abc import Callable +from datetime import timedelta +from itertools import count + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import RequestCollisionError +from crawlee.sessions import Session, SessionPool + + +# Define a function for creating sessions with simple logic for unique `id` generation. 
+# This is necessary if you need to specify a particular session for the first request, +# for example during authentication +def create_session_function() -> Callable[[], Session]: + counter = count() + + def create_session() -> Session: + return Session( + id=str(next(counter)), + max_usage_count=999_999, + max_age=timedelta(hours=999_999), + max_error_score=100, + blocked_status_codes=[403], + ) + + return create_session + + +async def main() -> None: + crawler = HttpCrawler( + # Adjust request limits according to your pool size + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=500), + # Requests are bound to specific sessions, no rotation needed + max_session_rotations=0, + session_pool=SessionPool( + max_pool_size=10, create_session_function=create_session_function() + ), + ) + + @crawler.router.default_handler + async def basic_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Initialize the session and bind the next request to this session if needed + @crawler.router.handler(label='session_init') + async def session_init(context: HttpCrawlingContext) -> None: + next_requests = [] + if context.session: + context.log.info(f'Init session {context.session.id}') + next_request = Request.from_url( + 'https://a.placeholder.com', session_id=context.session.id + ) + next_requests.append(next_request) + + await context.add_requests(next_requests) + + # Handle errors when a session is blocked and no longer available in the pool + # when attempting to execute requests bound to it + @crawler.failed_request_handler + async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: + if isinstance(error, RequestCollisionError) and context.session: + context.log.error( + f'Request {context.request.url} failed, because the bound ' + 'session is unavailable' + ) + + # Create a pool of requests bound to their respective sessions + # Use `always_enqueue=True` if session initialization 
happens on a non-unique address, + # such as the site's main page + init_requests = [ + Request.from_url( + 'https://example.org/', + label='session_init', + session_id=str(session_id), + use_extended_unique_key=True, + ) + for session_id in range(1, 11) + ] + + await crawler.run(init_requests) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/one_session_http.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/one_session_http.py new file mode 100644 index 0000000000..28cec44b63 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/one_session_http.py @@ -0,0 +1,56 @@ +import asyncio +from datetime import timedelta + +from crawlee import ConcurrencySettings, Request +from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext +from crawlee.errors import SessionError +from crawlee.sessions import SessionPool + + +async def main() -> None: + crawler = HttpCrawler( + # Limit requests per minute to reduce the chance of being blocked + concurrency_settings=ConcurrencySettings(max_tasks_per_minute=50), + # Disable session rotation + max_session_rotations=0, + session_pool=SessionPool( + # Only one session in the pool + max_pool_size=1, + create_session_settings={ + # High value for session usage limit + 'max_usage_count': 999_999, + # High value for session lifetime + 'max_age': timedelta(hours=999_999), + # High score allows the session to encounter more errors + # before crawlee decides the session is blocked + # Make sure you know how to handle these errors + 'max_error_score': 100, + # 403 status usually indicates you're already blocked + 'blocked_status_codes': [403], + }, + ), + ) + + # Basic request handling logic + @crawler.router.default_handler + async def basic_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Handler 
for session initialization (authentication, initial cookies, etc.) + @crawler.router.handler(label='session_init') + async def session_init(context: HttpCrawlingContext) -> None: + if context.session: + context.log.info(f'Init session {context.session.id}') + + # Monitor if our session gets blocked and explicitly stop the crawler + @crawler.error_handler + async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: + if isinstance(error, SessionError) and context.session: + context.log.info(f'Session {context.session.id} blocked') + crawler.stop() + + await crawler.run([Request.from_url('https://example.org/', label='session_init')]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_basic.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_basic.py new file mode 100644 index 0000000000..30e1d7ae92 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_basic.py @@ -0,0 +1,48 @@ +import asyncio +import re + +from crawlee.crawlers import BasicCrawler, BasicCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = BasicCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. 
+ session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + @crawler.router.default_handler + async def default_handler(context: BasicCrawlingContext) -> None: + # Send request, BasicCrawler automatically selects a session from the pool + # and sets a proxy for it. You can check it with `context.session` + # and `context.proxy_info`. + response = await context.send_request(context.request.url) + + page_content = (await response.read()).decode() + title_match = re.search(r'(.*?)', page_content) + + if context.session and (title := title_match.group(1) if title_match else None): + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_beautifulsoup.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_beautifulsoup.py new file mode 100644 index 0000000000..a54fd8425f --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_beautifulsoup.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). 
+ use_session_pool=True, + # Overrides default Session pool configuration. + session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + title = context.soup.title.get_text() if context.soup.title else None + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_http.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_http.py new file mode 100644 index 0000000000..9497594d3b --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_http.py @@ -0,0 +1,44 @@ +import asyncio +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = HttpCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. 
+ session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: HttpCrawlingContext) -> None: + page_content = (await context.http_response.read()).decode() + title_match = re.search(r'(.*?)', page_content) + + if context.session and (title := title_match.group(1) if title_match else None): + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_parsel.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_parsel.py new file mode 100644 index 0000000000..66752a63c3 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_parsel.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = ParselCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. 
+ session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + title = context.selector.css('title::text').get() + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_playwright.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_playwright.py new file mode 100644 index 0000000000..46a4c4f096 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_playwright.py @@ -0,0 +1,42 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.proxy_configuration import ProxyConfiguration +from crawlee.sessions import SessionPool + + +async def main() -> None: + # To use the proxy IP session rotation logic, you must turn the proxy usage on. + proxy_configuration = ProxyConfiguration( + # options + ) + + # Initialize crawler with a custom SessionPool configuration + # to manage concurrent sessions and proxy rotation + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + # Activates the Session pool (default is true). + use_session_pool=True, + # Overrides default Session pool configuration. 
+ session_pool=SessionPool(max_pool_size=100), + ) + + # Define the default request handler that manages session states + # based on the response content and potential blocking + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + title = await context.page.title() + + if context.session: + if title == 'Blocked': + context.session.retire() + elif title == 'Not sure if blocked, might also be a connection error': + context.session.mark_bad() + else: + context.session.mark_good() # BasicCrawler handles this automatically. + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_standalone.py b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_standalone.py new file mode 100644 index 0000000000..32989dc7e0 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/session_management/sm_standalone.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.sessions import SessionPool + + +async def main() -> None: + # Override the default Session pool configuration. + async with SessionPool( + max_pool_size=100, + create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]}, + ) as session_pool: + session = await session_pool.get_session() + + # Increase the error_score. + session.mark_bad() + + # Throw away the session. + session.retire() + + # Lower the error_score and mark the session good. 
+ session.mark_good() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/custom_storage_client_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/custom_storage_client_example.py new file mode 100644 index 0000000000..271b83d811 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/custom_storage_client_example.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.storage_clients import StorageClient +from crawlee.storage_clients._base import ( + DatasetClient, + KeyValueStoreClient, + RequestQueueClient, +) + +if TYPE_CHECKING: + from crawlee.configuration import Configuration + +# Implement the storage type clients with your backend logic. + + +class CustomDatasetClient(DatasetClient): + # Implement methods like push_data, get_data, iterate_items, etc. + pass + + +class CustomKeyValueStoreClient(KeyValueStoreClient): + # Implement methods like get_value, set_value, delete, etc. + pass + + +class CustomRequestQueueClient(RequestQueueClient): + # Implement methods like add_request, fetch_next_request, etc. + pass + + +# Implement the storage client factory. + + +class CustomStorageClient(StorageClient): + async def create_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomDatasetClient: + # Create and return your custom dataset client. + pass + + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomKeyValueStoreClient: + # Create and return your custom key-value store client. 
+ pass + + async def create_rq_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomRequestQueueClient: + # Create and return your custom request queue client. + pass diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py new file mode 100644 index 0000000000..62969f8024 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py @@ -0,0 +1,8 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import FileSystemStorageClient + +# Create a new instance of storage client. +storage_client = FileSystemStorageClient() + +# And pass it to the crawler. +crawler = ParselCrawler(storage_client=storage_client) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py new file mode 100644 index 0000000000..1d3507660f --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py @@ -0,0 +1,18 @@ +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import FileSystemStorageClient + +# Create a new instance of storage client. +storage_client = FileSystemStorageClient() + +# Create a configuration with custom settings. +configuration = Configuration( + storage_dir='./my_storage', + purge_on_start=False, +) + +# And pass them to the crawler. 
+crawler = ParselCrawler( + storage_client=storage_client, + configuration=configuration, +) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/memory_storage_client_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/memory_storage_client_basic_example.py new file mode 100644 index 0000000000..fe79edc3f4 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/memory_storage_client_basic_example.py @@ -0,0 +1,8 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient + +# Create a new instance of storage client. +storage_client = MemoryStorageClient() + +# And pass it to the crawler. +crawler = ParselCrawler(storage_client=storage_client) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_basic_example.py new file mode 100644 index 0000000000..e787069d94 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_basic_example.py @@ -0,0 +1,10 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import RedisStorageClient + +# Create a new instance of storage client using connection string. +# 'redis://localhost:6379' is just a placeholder, replace it with your actual +# connection string. +storage_client = RedisStorageClient(connection_string='redis://localhost:6379') + +# And pass it to the crawler. 
+crawler = ParselCrawler(storage_client=storage_client) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py new file mode 100644 index 0000000000..ad1863aa23 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py @@ -0,0 +1,27 @@ +from redis.asyncio import Redis + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import RedisStorageClient + +# Create a new instance of storage client using a Redis client with custom settings. +# Replace host and port with your actual Redis server configuration. +# Other Redis client settings can be adjusted as needed. +storage_client = RedisStorageClient( + redis=Redis( + host='localhost', + port=6379, + retry_on_timeout=True, + socket_keepalive=True, + socket_connect_timeout=10, + ) +) + +# Create a configuration with custom settings. +configuration = Configuration(purge_on_start=False) + +# And pass them to the crawler. 
+crawler = ParselCrawler( + storage_client=storage_client, + configuration=configuration, +) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/registering_storage_clients_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/registering_storage_clients_example.py new file mode 100644 index 0000000000..b1107ea4ad --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/registering_storage_clients_example.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + # Create custom storage client, MemoryStorageClient for example. + storage_client = MemoryStorageClient() + + # Register it globally via the service locator. + service_locator.set_storage_client(storage_client) + + # Or pass it directly to the crawler, it will be registered globally + # to the service locator under the hood. + crawler = ParselCrawler(storage_client=storage_client) + + # Or just provide it when opening a storage (e.g. dataset), it will be used + # for this storage only, not globally. 
+ dataset = await Dataset.open( + name='my-dataset', + storage_client=storage_client, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_basic_example.py new file mode 100644 index 0000000000..90c27f4039 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_basic_example.py @@ -0,0 +1,12 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import SqlStorageClient + + +async def main() -> None: + # Create a new instance of storage client. + # This will create an SQLite database file crawlee.db or create tables in your + # database if you pass `connection_string` or `engine`. + # Use the context manager to ensure that connections are properly cleaned up. + async with SqlStorageClient() as storage_client: + # And pass it to the crawler. + crawler = ParselCrawler(storage_client=storage_client) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py new file mode 100644 index 0000000000..257f392683 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py @@ -0,0 +1,33 @@ +from sqlalchemy.ext.asyncio import create_async_engine + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import SqlStorageClient + + +async def main() -> None: + # Create a new instance of storage client. + # On first run, also creates tables in your PostgreSQL database. + # Use the context manager to ensure that connections are properly cleaned up. 
+ async with SqlStorageClient( + # Create an `engine` with the desired configuration + engine=create_async_engine( + 'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres', + future=True, + pool_size=5, + max_overflow=10, + pool_recycle=3600, + pool_pre_ping=True, + echo=False, + ) + ) as storage_client: + # Create a configuration with custom settings. + configuration = Configuration( + purge_on_start=False, + ) + + # And pass them to the crawler. + crawler = ParselCrawler( + storage_client=storage_client, + configuration=configuration, + ) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_do_not_purge_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_do_not_purge_example.py new file mode 100644 index 0000000000..6514863555 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_do_not_purge_example.py @@ -0,0 +1,23 @@ +import asyncio + +from crawlee.configuration import Configuration +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Set the purge_on_start field to False to avoid purging the storage on start. + # highlight-next-line + configuration = Configuration(purge_on_start=False) + + # Pass the configuration to the crawler. 
+ crawler = HttpCrawler(configuration=configuration) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py new file mode 100644 index 0000000000..17911b79d7 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/cleaning_purge_explicitly_example.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Create storage client with configuration + dataset = await Dataset.open(name='my-dataset') + + # Purge the dataset explicitly - purging will remove all items from the dataset. + # But keeps the dataset itself and its metadata. + await dataset.purge() + + # Or you can drop the dataset completely, which will remove the dataset + # and all its items. + await dataset.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_basic_example.py new file mode 100644 index 0000000000..03b7581f85 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_basic_example.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Open the dataset, if it does not exist, it will be created. + # Leave name empty to use the default dataset. + dataset = await Dataset.open(name='my-dataset') + + # Push a single row of data. 
+ await dataset.push_data({'foo': 'bar'}) + + # Push multiple rows of data (anything JSON-serializable can be pushed). + await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]) + + # Fetch all data from the dataset. + data = await dataset.get_data() + # Do something with it... + + # Remove the dataset. + await dataset.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_example.py new file mode 100644 index 0000000000..7e40824166 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_example.py @@ -0,0 +1,32 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create a new crawler (it can be any subclass of BasicCrawler). + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the (default) dataset. + await context.push_data(data) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the dataset to a file. 
+ await crawler.export_data(path='dataset.csv') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py new file mode 100644 index 0000000000..2b19c86994 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/dataset_with_crawler_explicit_example.py @@ -0,0 +1,37 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storages import Dataset + + +async def main() -> None: + # Open the dataset, if it does not exist, it will be created. + # Leave name empty to use the default dataset. + dataset = await Dataset.open(name='my-dataset') + + # Create a new crawler (it can be any subclass of BasicCrawler). + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the dataset. + await dataset.push_data(data) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + # Export the dataset to the key-value store. 
+ await dataset.export_to(key='dataset', content_type='csv') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_add_requests_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_add_requests_example.py new file mode 100644 index 0000000000..15104cf6fc --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_add_requests_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # highlight-next-line + await context.add_requests(['https://apify.com/']) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_enqueue_links_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_enqueue_links_example.py new file mode 100644 index 0000000000..6c7392bc3b --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/helper_enqueue_links_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # highlight-next-line + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_basic_example.py new file mode 100644 index 0000000000..9cc66c59a5 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_basic_example.py @@ -0,0 +1,26 @@ +import asyncio + +from crawlee.storages import KeyValueStore + + +async def main() -> None: + # Open the key-value store, if it does not exist, it will be created. + # Leave name empty to use the default KVS. + kvs = await KeyValueStore.open(name='my-key-value-store') + + # Set a value associated with 'some-key'. + await kvs.set_value(key='some-key', value={'foo': 'bar'}) + + # Get the value associated with 'some-key'. + value = await kvs.get_value('some-key') + # Do something with it... + + # Delete the value associated with 'some-key' by setting it to None. + await kvs.set_value(key='some-key', value=None) + + # Remove the key-value store. + await kvs.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_example.py new file mode 100644 index 0000000000..732ee41f76 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # Create a new Playwright crawler. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. 
+ screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Get the key-value store from the context. # If it does not exist, + # it will be created. Leave name empty to use the default KVS. + kvs = await context.get_key_value_store() + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py new file mode 100644 index 0000000000..4c965457c3 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/kvs_with_crawler_explicit_example.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import KeyValueStore + + +async def main() -> None: + # Open the key-value store, if it does not exist, it will be created. + # Leave name empty to use the default KVS. + kvs = await KeyValueStore.open(name='my-key-value-store') + + # Create a new Playwright crawler. + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Capture the screenshot of the page using Playwright's API. + screenshot = await context.page.screenshot() + name = context.request.url.split('/')[-1] + + # Store the screenshot in the key-value store. + await kvs.set_value( + key=f'screenshot-{name}', + value=screenshot, + content_type='image/png', + ) + + # Run the crawler with the initial URLs. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/opening.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/opening.py new file mode 100644 index 0000000000..3ce77d7a67 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/opening.py @@ -0,0 +1,18 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Named storage (persists across runs) + dataset_named = await Dataset.open(name='my-persistent-dataset') + + # Unnamed storage with alias (purged on start) + dataset_unnamed = await Dataset.open(alias='temporary-results') + + # Default unnamed storage (purged on start) + dataset_default = await Dataset.open() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_basic_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_basic_example.py new file mode 100644 index 0000000000..388c184fc6 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_basic_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Open the request queue, if it does not exist, it will be created. + # Leave name empty to use the default request queue. + request_queue = await RequestQueue.open(name='my-request-queue') + + # Add a single request. + await request_queue.add_request('https://apify.com/') + + # Add multiple requests as a batch. + await request_queue.add_requests( + ['https://crawlee.dev/', 'https://crawlee.dev/python/'] + ) + + # Fetch and process requests from the queue. + while request := await request_queue.fetch_next_request(): + # Do something with it... + + # And mark it as handled. 
+ await request_queue.mark_request_as_handled(request) + + # Remove the request queue. + await request_queue.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_example.py new file mode 100644 index 0000000000..ce6a34cb59 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_example.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is + # a default request manager, it will be opened, and fully managed if not specified. + crawler = HttpCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Use context's add_requests method helper to add new requests from the handler. + await context.add_requests(['https://crawlee.dev/python/']) + + # Use crawler's add_requests method helper to add new requests. + await crawler.add_requests(['https://apify.com/']) + + # Run the crawler. You can optionally pass the list of initial requests. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py new file mode 100644 index 0000000000..aac7b0bcb8 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/storages/rq_with_crawler_explicit_example.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Open the request queue, if it does not exist, it will be created. + # Leave name empty to use the default request queue. + request_queue = await RequestQueue.open(name='my-request-queue') + + # Interact with the request queue directly, e.g. add a batch of requests. + await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/']) + + # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request + # queue as request manager to it. It will be managed by the crawler. + crawler = HttpCrawler(request_manager=request_queue) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # And execute the crawler. 
+ await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py b/website/versioned_docs/version-1.6/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py new file mode 100644 index 0000000000..f66f5c7698 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py @@ -0,0 +1,57 @@ +import asyncio + +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.trace import set_tracer_provider + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext +from crawlee.otel import CrawlerInstrumentor +from crawlee.storages import Dataset, KeyValueStore, RequestQueue + + +def instrument_crawler() -> None: + """Add instrumentation to the crawler.""" + resource = Resource.create( + { + 'service.name': 'ExampleCrawler', + 'service.version': '1.0.0', + 'environment': 'development', + } + ) + + # Set up the OpenTelemetry tracer provider and exporter + provider = TracerProvider(resource=resource) + otlp_exporter = OTLPSpanExporter(endpoint='localhost:4317', insecure=True) + provider.add_span_processor(SimpleSpanProcessor(otlp_exporter)) + set_tracer_provider(provider) + # Instrument the crawler with OpenTelemetry + CrawlerInstrumentor( + instrument_classes=[RequestQueue, KeyValueStore, Dataset] + ).instrument() + + +async def main() -> None: + """Run the crawler.""" + instrument_crawler() + + crawler = ParselCrawler(max_requests_per_crawl=100) + kvs = await KeyValueStore.open() + + @crawler.pre_navigation_hook + async def pre_nav_hook(_: BasicCrawlingContext) -> None: + # Simulate some pre-navigation processing + await 
asyncio.sleep(0.01) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + await context.push_data({'url': context.request.url}) + await kvs.set_value(key='url', value=context.request.url) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/guides/crawler_login.mdx b/website/versioned_docs/version-1.6/guides/crawler_login.mdx new file mode 100644 index 0000000000..fc02014dde --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/crawler_login.mdx @@ -0,0 +1,41 @@ +--- +id: logging-in-with-a-crawler +title: Logging in with a crawler +description: How to log in to websites with Crawlee. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import PlaywrightLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/playwright_login.py'; +import HttpLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/http_login.py'; + +Many websites require authentication to access their content. This guide demonstrates how to implement login functionality using both `PlaywrightCrawler` and `HttpCrawler`. + +## Session management for authentication + +When implementing authentication, you'll typically want to maintain the same `Session` throughout your crawl to preserve login state. This requires proper configuration of the `SessionPool`. For more details, see our [session management guide](./session-management). + +If your use case requires multiple authenticated sessions with different credentials, you can: +- Use the `new_session_function` parameter in `SessionPool` to customize session creation. +- Specify the `session_id` parameter in `Request` to bind specific requests to particular sessions. 
+ +For this guide, we'll use [demoqa.com](https://demoqa.com/login), a testing site designed for automation practice that provides a login form and protected content. + +## Login with Playwright crawler + +The following example demonstrates how to authenticate on a website using `PlaywrightCrawler`, which provides browser automation capabilities for filling out login forms. + + + {PlaywrightLogin} + + +## Login with HTTP crawler + +You can also use `HttpCrawler` (or its more specific variants like `ParselCrawler` or `BeautifulSoupCrawler`) to authenticate by sending a POST `Request` with your credentials directly to the authentication endpoint. + +HTTP-based authentication often varies significantly between websites. Using browser [DevTools](https://developer.chrome.com/docs/devtools/overview) to analyze the `Network` tab during manual login can help you understand the specific authentication flow, required headers, and body parameters for your target website. + + + {HttpLogin} + diff --git a/website/versioned_docs/version-1.6/guides/creating_web_archive.mdx b/website/versioned_docs/version-1.6/guides/creating_web_archive.mdx new file mode 100644 index 0000000000..57f8bd3d49 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/creating_web_archive.mdx @@ -0,0 +1,89 @@ +--- +id: creating-web-archive +title: Creating web archive +description: How to create a Web ARChive (WARC) with Crawlee +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import PlaywrightCrawlerRecordThroughProxy from '!!raw-loader!./code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py'; +import ParselCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_parsel_crawler.py'; +import PlaywrightCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_playwright_crawler.py'; + +Archiving webpages is one of the tasks that a web crawler can be 
used for. There are various use cases, such as archiving for future reference, speeding up web crawler development, creating top-level regression tests for web crawlers and so on. + +There are various existing libraries of web archives with massive amount of data stored during their years of existence, for example [Wayback Machine](https://web.archive.org/) or [Common Crawl](https://commoncrawl.org/). There are also dedicated tools for archiving web pages, to name some: simple browser extensions such as [Archive Webpage](https://archiveweb.page/), open source tools such as [pywb](https://pypi.org/project/pywb/) or [warcio](https://pypi.org/project/warcio/), or even web crawlers specialized in archiving such as [Browsertrix](https://webrecorder.net/browsertrix/). + +The common file format used for archiving is [WARC](https://www.iso.org/standard/68004.html). Crawlee does not offer any out-of-the-box functionality to create WARC files, but in this guide, we will show examples of approaches that can be easily used in your use case to create WARC files with Crawlee. + +## Crawling through proxy recording server + +This approach can be especially attractive as it does not require almost any code change to the crawler itself and the correct WARC creation is done by code from well maintained [pywb](https://pypi.org/project/pywb/) package. The trick is to run a properly configured [wayback proxy server](https://pywb.readthedocs.io/en/latest/manual/usage.html#using-pywb-recorder), use it as a proxy for the crawler and record any traffic. Another advantage of this approach is that it is language agnostic. This way, you can record both your Python-based crawler and your JavaScript-based crawler. This is very straightforward and a good place to start. + +This approach expects that you have already created your crawler, and that you just want to archive all the pages it is visiting during its crawl. 
+ +Install [pywb](https://pypi.org/project/pywb/) which will allow you to use `wb-manager` and `wayback` commands. +Create a new collection that will be used for this archiving session and start the wayback server: +```bash +wb-manager init example-collection +wayback --record --live -a --auto-interval 10 --proxy example-collection --proxy-record +``` +Instead of passing many configuration arguments to `wayback` command, you can configure the server by adding configuration options to `config.yaml`. See the details in the [documentation](https://pywb.readthedocs.io/en/latest/manual/configuring.html#configuring-the-web-archive). + +### Configure the crawler + +Now you should use this locally hosted server as a proxy in your crawler. There are two more steps before starting the crawler: + - Make the crawler use the proxy server. + - Deal with the [pywb Certificate Authority](https://pywb.readthedocs.io/en/latest/manual/configuring.html#https-proxy-and-pywb-certificate-authority). + +For example, in `PlaywrightCrawler`, this is the simplest setup, which takes the shortcut and ignores the CA-related errors: + + + {PlaywrightCrawlerRecordThroughProxy} + + +After you run the crawler you will be able to see the archived data in the wayback collection directory for example `.../collections/example-collection/archive`. You can then access the recorded pages directly in the proxy recording server or use it with any other WARC-compatible tool. + +## Manual WARC creation + +A different approach is to create WARC files manually in the crawler, which gives you full control over the WARC files. This is way more complex and low-level approach as you have to ensure that all the relevant data is collected, and correctly stored and that the archiving functions are called at the right time. This is by no means a trivial task and the example archiving functions below are just the most simple examples that will be insufficient for many real-world use cases. 
You will need to extend and improve them to properly fit your specific needs. + +### Simple crawlers + +With non-browser crawlers such as `ParselCrawler` you will not be able to create high fidelity archive of the page as you will be missing all the JavaScript dynamic content. However, you can still create a WARC file with the HTML content of the page, which can be sufficient for some use cases. Let's take a look at the example below: + + {ParselCrawlerRecordManual} + + +The example above is calling an archiving function on each request using the `request_handler`. + +### Browser-based crawlers + +With browser crawlers such as `PlaywrightCrawler` you should be able to create high fidelity archive of a web page. Let's take a look at the example below: + + + {PlaywrightCrawlerRecordManual} + + +The example above is adding an archiving callback on each response in the pre_navigation `archiving_hook`. This ensures that additional resources requested by the browser are also archived. + +## Using the archived data + +In the following section, we will describe an example use case how you can use the recorded WARC files to speed up the development of your web crawler. The idea is to use the archived data as a source of responses for your crawler so that you can test it against the real data without having to crawl the web again. + +It is assumed that you already have the WARC files. If not, please read the previous sections on how to create them first. + +Let's use pywb again. This time we will not use it as a recording server, but as a proxy server that will serve the previously archived pages to your crawler in development. + +```bash +wb-manager init example-collection +wb-manager add example-collection /your_path_to_warc_file/example.warc.gz +wayback --proxy example-collection +``` + +Previous commands start the wayback server that allows crawler requests to be served from the archived pages in the `example-collection` instead of sending requests to the real website. 
This is again [proxy mode of the wayback server](https://pywb.readthedocs.io/en/latest/manual/usage.html#http-s-proxy-mode-access), but without recording capability. Now you need to [configure your crawler](#configure-the-crawler) to use this proxy server, which was already described above. Once everything is finished, you can just run your crawler, and it will crawl the offline archived version of the website from your WARC file. + +You can also manually browse the archived pages in the wayback server by going to the locally hosted server and entering the collection and URL of the archived page, for example: `http://localhost:8080/example-collection/https:/crawlee.dev/`. The wayback server will serve the page from the WARC file if it exists, or it will return a 404 error if it does not. For more detail about the server please refer to the [pywb documentation](https://pywb.readthedocs.io/en/latest/manual/usage.html#getting-started). + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord](https://discord.com/invite/jyEM2PRvMU) community. diff --git a/website/versioned_docs/version-1.6/guides/error_handling.mdx b/website/versioned_docs/version-1.6/guides/error_handling.mdx new file mode 100644 index 0000000000..abd1b33058 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/error_handling.mdx @@ -0,0 +1,44 @@ +--- +id: error-handling +title: Error handling +description: How to handle errors that occur during web crawling. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py'; +import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py'; +import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py'; + +This guide demonstrates techniques for handling common errors encountered during web crawling operations. + +## Handling proxy errors + +Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in `BasicCrawlerOptions`. If you can't get data because of proxy errors, you might want to try again. You can do this using `failed_request_handler`: + + + {HandleProxyError} + + +You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many. + +## Changing how error status codes are handled + +By default, when `Sessions` get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the `Session` as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management). + +Here's an example of how to change this behavior: + + + {ChangeHandleErrorStatus} + + +## Turning off retries for non-network errors + +Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. 
Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that. + +Here's how to turn off retries for non-network errors using `error_handler`, which runs before Crawlee tries again: + + + {DisableRetry} + diff --git a/website/versioned_docs/version-1.6/guides/http_clients.mdx b/website/versioned_docs/version-1.6/guides/http_clients.mdx new file mode 100644 index 0000000000..28f3b70202 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/http_clients.mdx @@ -0,0 +1,118 @@ +--- +id: http-clients +title: HTTP clients +description: Learn about Crawlee's HTTP client architecture, how to switch between different implementations, and create custom HTTP clients for specialized web scraping needs. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ParselHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_httpx_example.py'; +import ParselCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_curl_impersonate_example.py'; +import ParselImpitExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_impit_example.py'; + +HTTP clients are utilized by HTTP-based crawlers (e.g., `ParselCrawler` and `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. 
Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript. + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class HttpClient { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class ImpitHttpClient + +class HttpxHttpClient + +class CurlImpersonateHttpClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +HttpClient --|> ImpitHttpClient +HttpClient --|> HttpxHttpClient +HttpClient --|> CurlImpersonateHttpClient +``` + +## Switching between HTTP clients + +Crawlee currently provides three main HTTP clients: `ImpitHttpClient`, which uses the `impit` library, `HttpxHttpClient`, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is `ImpitHttpClient`. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking). + +Below are examples of how to configure the HTTP client for the `ParselCrawler`: + + + + + {ParselHttpxExample} + + + + + {ParselCurlImpersonateExample} + + + + + {ParselImpitExample} + + + + +## Installation requirements + +Since `ImpitHttpClient` is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages. 
+ +For `CurlImpersonateHttpClient`, you need to install Crawlee with the `curl-impersonate` extra: + +```sh +python -m pip install 'crawlee[curl-impersonate]' +``` + +For `HttpxHttpClient`, you need to install Crawlee with the `httpx` extra: + +```sh +python -m pip install 'crawlee[httpx]' +``` + +Alternatively, you can install all available extras to get access to all HTTP clients and features: + +```sh +python -m pip install 'crawlee[all]' +``` + +## Creating custom HTTP clients + +Crawlee provides an abstract base class, `HttpClient`, which defines the interface that all HTTP clients must implement. This allows you to create custom HTTP clients tailored to your specific requirements. + +HTTP clients are responsible for several key operations: + +- sending HTTP requests and receiving responses, +- managing cookies and sessions, +- handling headers and authentication, +- managing proxy configurations, +- connection pooling with timeout management. + +To create a custom HTTP client, you need to inherit from the `HttpClient` base class and implement all required abstract methods. Your implementation must be async-compatible and include proper cleanup and resource management to work seamlessly with Crawlee's concurrent processing model. + +## Conclusion + +This guide introduced you to the HTTP clients available in Crawlee and demonstrated how to switch between them, including their installation requirements and usage examples. You also learned about the responsibilities of HTTP clients and how to implement your own custom HTTP client by inheriting from the `HttpClient` base class. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/website/versioned_docs/version-1.6/guides/http_crawlers.mdx b/website/versioned_docs/version-1.6/guides/http_crawlers.mdx new file mode 100644 index 0000000000..366b36127c --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/http_crawlers.mdx @@ -0,0 +1,193 @@ +--- +id: http-crawlers +title: HTTP crawlers +description: Learn about Crawlee's HTTP crawlers including BeautifulSoup, Parsel, and raw HTTP crawlers for efficient server-rendered content extraction without JavaScript execution. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; + +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py'; +import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py'; +import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py'; + +import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py'; +import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py'; +import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py'; +import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py'; +import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py'; + +import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py'; +import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py'; +import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py'; +import SelectolaxCrawlerRunSource from 
'!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py'; +import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py'; + +HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript consider using [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead. + +## Overview + +All HTTP crawlers share a common architecture built around the `AbstractHttpCrawler` base class. The main differences lie in the parsing strategy and the context provided to request handlers. There are `BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`. It can also be extended to create custom crawlers with specialized parsing requirements. They use HTTP clients to fetch page content and parsing libraries to extract data from the HTML, check out the [HTTP clients guide](./http-clients) to learn about the HTTP clients used by these crawlers, how to switch between them, and how to create custom HTTP clients tailored to your specific requirements. 
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class BasicCrawler { + <> +} + +class AbstractHttpCrawler { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class HttpCrawler + +class ParselCrawler + +class BeautifulSoupCrawler + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawler --|> AbstractHttpCrawler +AbstractHttpCrawler --|> HttpCrawler +AbstractHttpCrawler --|> ParselCrawler +AbstractHttpCrawler --|> BeautifulSoupCrawler +``` + +## BeautifulSoupCrawler + +The `BeautifulSoupCrawler` uses the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library for HTML parsing. It provides fault-tolerant parsing that handles malformed HTML, automatic character encoding detection, and supports CSS selectors, tag navigation, and custom search functions. Use this crawler when working with imperfect HTML structures, when you prefer BeautifulSoup's intuitive API, or when prototyping web scraping solutions. + + + {BeautifulSoupExample} + + +## ParselCrawler + +The `ParselCrawler` uses the [Parsel](https://parsel.readthedocs.io/) library, which provides XPath 1.0 and CSS selector support built on `lxml` for high performance. It includes built-in regex support for pattern matching, proper XML namespace handling, and offers better performance than BeautifulSoup while maintaining a clean API. Use this crawler when you need XPath functionality, require high-performance parsing, or need to extract data using regular expressions. + + + {ParselExample} + + +## HttpCrawler + +The `HttpCrawler` provides direct access to HTTP response body and headers without automatic parsing, offering maximum performance with no parsing overhead. 
It supports any content type (JSON, XML, binary) and allows complete control over response processing, including memory-efficient handling of large responses. Use this crawler when working with non-HTML content, requiring maximum performance, implementing custom parsing logic, or needing access to raw response data. + + + {HttpExample} + + +### Using custom parsers + +Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. + +The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods). + + + + + {LxmlParser} + + + + + {LxmlSaxoncheParser} + + + + + {LexborParser} + + + + + {PyqueryParser} + + + + + {ScraplingParser} + + + + +## Custom HTTP crawler + +While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing: + +1. **Custom parser class**: Inherit from `AbstractHttpParser`. +2. **Custom context class**: Define what data and helpers are available to handlers. +3. **Custom crawler class**: Tie everything together. + +This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format. + +The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. 
+ +### Parser implementation + +The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: + + + {SelectolaxParserSource} + + +This is enough to use your parser with `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. + +### Crawling context definition (optional) + +The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. + + + {SelectolaxContextSource} + + +### Crawler composition + +The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: + + + {SelectolaxCrawlerSource} + + +### Crawler usage + +The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: + + + + + {SelectolaxCrawlerRunSource} + + + + + {AdaptiveCrawlerRunSource} + + + + +## Conclusion + +This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to integrate third-party parsing libraries with `HttpCrawler` and how to create fully custom crawlers using `AbstractHttpCrawler` for specialized parsing requirements. 
+ +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/playwright_crawler.mdx b/website/versioned_docs/version-1.6/guides/playwright_crawler.mdx new file mode 100644 index 0000000000..17eebcc465 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/playwright_crawler.mdx @@ -0,0 +1,78 @@ +--- +id: playwright-crawler +title: Playwright crawler +description: Learn how to use PlaywrightCrawler for browser-based web scraping. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py'; +import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py'; +import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py'; +import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py'; +import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py'; + +A `PlaywrightCrawler` is a browser-based crawler. In contrast to HTTP-based crawlers like `ParselCrawler` or `BeautifulSoupCrawler`, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage. 
+ +## When to use Playwright crawler + +Use `PlaywrightCrawler` in scenarios that require full browser capabilities, such as: + +- **Dynamic content rendering**: Required when pages rely on heavy JavaScript to load or modify content in the browser. +- **Anti-scraping protection**: Helpful for sites using JavaScript-based security or advanced anti-automation measures. +- **Complex cookie management**: Necessary for sites with session or cookie requirements that standard HTTP-based crawlers cannot handle easily. + +If [HTTP-based crawlers](https://crawlee.dev/python/docs/guides/http-crawlers) are insufficient, `PlaywrightCrawler` can address these challenges. See a [basic example](../examples/playwright-crawler) for a typical usage demonstration. + +## Advanced configuration + +The `PlaywrightCrawler` uses other Crawlee components under the hood, notably `BrowserPool` and `PlaywrightBrowserPlugin`. These components let you to configure the browser and context settings, launch multiple browsers, and apply pre-navigation hooks. You can create your own instances of these components and pass them to the `PlaywrightCrawler` constructor. + +- The `PlaywrightBrowserPlugin` manages how browsers are launched and how browser contexts are created. It accepts [browser launch](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new context](https://playwright.dev/python/docs/api/class-browser#browser-new-context) options. +- The `BrowserPool` manages the lifecycle of browser instances (launching, recycling, etc.). You can customize its behavior to suit your needs. + +## Managing multiple browsers + +The `BrowserPool` allows you to manage multiple browsers. Each browser instance is managed by a separate `PlaywrightBrowserPlugin` and can be configured independently. This is useful for scenarios like testing multiple configurations or implementing browser rotation to help avoid blocks or detect different site behaviors. 
+ + + {MultipleLaunchExample} + + +## Browser launch and context configuration + +The `PlaywrightBrowserPlugin` provides access to all relevant Playwright configuration options for both [browser launches](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new browser contexts](https://playwright.dev/python/docs/api/class-browser#browser-new-context). You can specify these options in the constructor of `PlaywrightBrowserPlugin` or `PlaywrightCrawler`: + + + {BrowserConfigurationExample} + + +You can also configure each plugin used by `BrowserPool`: + + + {PluginBrowserConfigExample} + + +For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with `PlaywrightCrawler`. + +## Page configuration with lifecycle page hooks + +For additional setup or event-driven actions around page creation and closure, the `BrowserPool` exposes four lifecycle hooks: `pre_page_create_hook`, `post_page_create_hook`, `pre_page_close_hook`, and `post_page_close_hook`. To use them, create a `BrowserPool` instance and pass it to `PlaywrightCrawler` via the `browser_pool` argument. + + + {BrowserPoolPageHooksExample} + + +## Navigation hooks + +Navigation hooks allow for additional configuration at specific points during page navigation. The `pre_navigation_hook` is called before each navigation and provides `PlaywrightPreNavCrawlingContext` - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a `block_requests` helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough. 
Similarly, the `post_navigation_hook` is called after each navigation and provides `PlaywrightPostNavCrawlingContext` - useful for post-load checks such as detecting CAPTCHAs or verifying page state. + + + {NavigationHooksExample} + + +## Conclusion + +This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, use `BrowserPool` lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/playwright_crawler_adaptive.mdx b/website/versioned_docs/version-1.6/guides/playwright_crawler_adaptive.mdx new file mode 100644 index 0000000000..7957b98015 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/playwright_crawler_adaptive.mdx @@ -0,0 +1,94 @@ +--- +id: adaptive-playwright-crawler +title: Adaptive Playwright crawler +description: Learn how to use the Adaptive Playwright crawler to automatically switch between browser-based and HTTP-only crawling. 
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/handler.py';
+import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/pre_nav_hooks.py';
+
+import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_beautifulsoup.py';
+import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_parsel.py';
+import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_prediction.py';
+
+An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and some implementation of an HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`.
+It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.
+
+Detection is done based on the `RenderingTypePredictor` with the default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages.
+
+## When to use AdaptivePlaywrightCrawler
+
+Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others a faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites.
+
+Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by code executed in the browser.
+
+## Request handler and adaptive context helpers
+
+The request handler for `AdaptivePlaywrightCrawler` works on a special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by the HTTP-based sub crawler and sometimes by the Playwright-based sub crawler. Due to its dynamic nature, you can't always access the [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created.
+
+`wait_for_selector` accepts a `css` selector as the first argument and a timeout as the second argument. The function will try to locate this selector and return once it is found (within the timeout). In practice this means that if the HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the Playwright sub crawler and will try to locate the selector within the timeout using Playwright.
+
+`query_selector_one` accepts a `css` selector as the first argument and a timeout as the second argument. This function acts similarly to `wait_for_selector`, but it also returns one matching selector if any is found. The return value type is determined by the HTTP-based sub crawler used. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`.
+
+`query_selector_all` is the same as `query_selector_one`, but returns all found selectors.
+
+`parse_with_static_parser` will re-parse the whole page. The return value type is determined by the HTTP-based sub crawler used. It has optional arguments: `selector` and `timeout`. If those optional arguments are used, the function first calls `wait_for_selector` and then does the parsing. This can be used in scenarios where a specific element can signal that the page is already complete.
+ +See the following example about how to create request handler and use context helpers: + + + {AdaptivePlaywrightCrawlerHandler} + + +## Crawler configuration + +To use `AdaptivePlaywrightCrawler` it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser` or `AdaptivePlaywrightCrawler.with_parsel_static_parser`. + +`AdaptivePlaywrightCrawler` is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. + +In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: + + + + + {AdaptivePlaywrightCrawlerInitBeautifulSoup} + + + + + {AdaptivePlaywrightCrawlerInitParsel} + + + + +### Prediction related arguments + +To control which pages are crawled by which method you can use following arguments: + +`RenderingTypePredictor` - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend to use both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations. + +`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`. + +`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. 
This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler. + +See the following example about how to pass prediction related arguments: + + + {AdaptivePlaywrightCrawlerInitPrediction} + + +## Page configuration with pre-navigation hooks + +In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. + +See the following example about how to register the pre navigation hooks: + + + {AdaptivePlaywrightCrawlerPreNavHooks} + diff --git a/website/versioned_docs/version-1.6/guides/playwright_crawler_stagehand.mdx b/website/versioned_docs/version-1.6/guides/playwright_crawler_stagehand.mdx new file mode 100644 index 0000000000..59a34b4cd2 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/playwright_crawler_stagehand.mdx @@ -0,0 +1,66 @@ +--- +id: playwright-crawler-stagehand +title: Playwright with Stagehand +description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py'; +import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py'; +import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py'; + +[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic. + +Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider. + +:::info + +This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions. + +::: + +## Get Gemini API key + +You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key. + +## Create support classes for Stagehand + +To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle. + +Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance. + +Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`. 
+ + + {SupportClasses} + + +## Create browser integration classes + +You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`. + +Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances. + +Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand. + + + {BrowserClasses} + + +## Create a crawler + +Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands: + + + {StagehandRun} + + +The integration works through several key components: +- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle +- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances +- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities +- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations + +In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors. 
diff --git a/website/versioned_docs/version-1.6/guides/proxy_management.mdx b/website/versioned_docs/version-1.6/guides/proxy_management.mdx new file mode 100644 index 0000000000..38385ac950 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/proxy_management.mdx @@ -0,0 +1,120 @@ +--- +id: proxy-management +title: Proxy management +description: Using proxies to get around those annoying IP-blocks +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import QuickStartExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/quick_start_example.py'; +import IntegrationBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_bs_example.py'; +import IntegrationPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_pw_example.py'; +import TiersBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_bs_example.py'; +import TiersPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_pw_example.py'; +import InspectionBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_bs_example.py'; +import InspectionPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_pw_example.py'; + +import SessionBsExample from '!!raw-loader!./code_examples/proxy_management/session_bs_example.py'; +import SessionPwExample from '!!raw-loader!./code_examples/proxy_management/session_pw_example.py'; + +[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. 
The most powerful weapon in our anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server). + +With Crawlee we can use our own proxy servers or proxy servers acquired from third-party providers. + +[//]: # (Check out the [avoid blocking guide](./avoid-blocking) for more information about blocking.) + +## Quick start + +If you already have proxy URLs of your own, you can start using them immediately in only a few lines of code. + + + {QuickStartExample} + + +Examples of how to use our proxy URLs with crawlers are shown below in [Crawler integration](#crawler-integration) section. + +## Proxy configuration + +All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` constructor function based on the provided options. + +### Crawler integration + +`ProxyConfiguration` integrates seamlessly into `BeautifulSoupCrawler` and `PlaywrightCrawler`. + + + + + {IntegrationBsExample} + + + + + {IntegrationPwExample} + + + + +Our crawlers will now use the selected proxies for all connections. + +### IP Rotation and session management + +The `proxy_configuration.new_url()` method allows us to pass a `session_id` parameter. This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the `SessionPool` class for more information on how maintaining a real session helps avoid blocking. + +For more details on session management, check out the [Session management](./session-management) guide. + +When no `session_id` is provided, our proxy URLs are rotated round-robin. 
+ + + + + {SessionBsExample} + + + + + {SessionPwExample} + + + + +### Tiered proxies + +When you use HTTP proxies in real world crawling scenarios, you have to decide which type of proxy to use to reach the sweet spot between cost efficiency and reliably avoiding blocking. Some websites may allow crawling with no proxy, on some you may get away with using datacenter proxies, which are cheap but easily detected, and sometimes you need to use expensive residential proxies. + +To take the guesswork out of this process, Crawlee allows you to configure multiple tiers of proxy URLs. When crawling, it will automatically pick the lowest tier (smallest index) where it doesn't encounter blocking. If you organize your proxy server URLs in tiers so that the lowest tier contains the cheapest, least reliable ones and each higher tier contains more expensive, more reliable ones, you will get an optimal anti-blocking performance. + +In an active tier, Crawlee will alternate between proxies in a round-robin fashion, just like it would with `proxy_urls`. + + + + + {TiersBsExample} + + + + + {TiersPwExample} + + + + +## Inspecting current proxy in crawlers + +The `BeautifulSoupCrawler` and `PlaywrightCrawler` provide access to information about the currently used proxy via the request handler using a `proxy_info` object. This object allows easy access to the proxy URL. + + + + + {InspectionBsExample} + + + + + {InspectionPwExample} + + + diff --git a/website/versioned_docs/version-1.6/guides/request_loaders.mdx b/website/versioned_docs/version-1.6/guides/request_loaders.mdx new file mode 100644 index 0000000000..2c5607c8ff --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/request_loaders.mdx @@ -0,0 +1,203 @@ +--- +id: request-loaders +title: Request loaders +description: How to manage the requests your crawler will go through. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; +import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py'; +import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py'; +import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py'; +import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py'; +import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py'; +import RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py'; +import SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py'; + +The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the `RequestQueue`, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the `RequestQueue`, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together. + +## Overview + +The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package introduces the following abstract classes: + +- `RequestLoader`: The base interface for reading requests in a crawl. 
+- `RequestManager`: Extends `RequestLoader` with write capabilities.
+- `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`.
+
+And specific request loader implementations:
+
+- `RequestList`: A lightweight implementation for managing a static list of URLs.
+- `SitemapRequestLoader`: A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.
+
+Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`:
+
+```mermaid
+---
+config:
+  class:
+    hideEmptyMembersBox: true
+---
+
+classDiagram
+
+%% ========================
+%% Abstract classes
+%% ========================
+
+class Storage {
+    <<abstract>>
+    + id
+    + name
+    + open()
+    + drop()
+}
+
+class RequestLoader {
+    <<abstract>>
+    + handled_count
+    + total_count
+    + fetch_next_request()
+    + mark_request_as_handled()
+    + is_empty()
+    + is_finished()
+    + to_tandem()
+}
+
+class RequestManager {
+    <<abstract>>
+    + add_request()
+    + add_requests_batched()
+    + reclaim_request()
+    + drop()
+}
+
+%% ========================
+%% Specific classes
+%% ========================
+
+class RequestQueue
+
+class RequestList
+
+class SitemapRequestLoader
+
+class RequestManagerTandem
+
+%% ========================
+%% Inheritance arrows
+%% ========================
+
+Storage --|> RequestQueue
+RequestManager --|> RequestQueue
+
+RequestLoader --|> RequestManager
+RequestLoader --|> RequestList
+RequestLoader --|> SitemapRequestLoader
+RequestManager --|> RequestManagerTandem
+```
+
+## Request loaders
+
+The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios.
You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the `RequestLoader` API reference. + +:::info NOTE +To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below. +::: + +### Request list + +The `RequestList` can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs. + +Here is a basic example of working with the `RequestList`: + + + {RlBasicExample} + + +### Request list with persistence + +The `RequestList` supports state persistence, allowing it to resume from where it left off after interruption. This is particularly useful for long-running crawls or when you need to pause and resume crawling later. + +To enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example. + + + {RlBasicPersistExample} + + +### Sitemap request loader + +The `SitemapRequestLoader` is a specialized request loader that reads URLs from sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. + +:::note +The `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. 
HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality. +::: + +The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory. + + + {SitemapExample} + + +### Sitemap request loader with persistence + +Similarly, the `SitemapRequestLoader` supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete. + + + {SitemapPersistExample} + + +When using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed. + +## Request managers + +The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the `RequestManager` API reference. + +## Request manager tandem + +The `RequestManagerTandem` class allows you to combine the read-only capabilities of a `RequestLoader` (like `RequestList`) with the read-write capabilities of a `RequestManager` (like `RequestQueue`). This is useful for scenarios where you need to load initial requests from a static source (such as a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times. + +Under the hood, `RequestManagerTandem` checks whether the read-only loader still has pending requests. 
If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side. + +### Request list with request queue + +This section describes the combination of the `RequestList` and `RequestQueue` classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but also need to handle dynamic requests discovered during the crawl process. The `RequestManagerTandem` class facilitates this combination, with the `RequestLoader.to_tandem` method available as a convenient shortcut. Requests from the `RequestList` are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. + + + + + {RlExplicitTandemExample} + + + + + {RlTandemExample} + + + + +### Sitemap request loader with request queue + +Similar to the `RequestList` example above, you can combine a `SitemapRequestLoader` with a `RequestQueue` using the `RequestManagerTandem` class. This setup is particularly useful when you want to crawl URLs from a sitemap while also handling dynamic requests discovered during the crawl process. URLs from the sitemap are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. + + + + + {SitemapExplicitTandemExample} + + + + + {SitemapTandemExample} + + + + +## Conclusion + +This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs and requests. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` implementations. You also saw practical examples of how to work with these classes to handle various crawling scenarios. 
+ +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/request_router.mdx b/website/versioned_docs/version-1.6/guides/request_router.mdx new file mode 100644 index 0000000000..d9d7733abf --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/request_router.mdx @@ -0,0 +1,112 @@ +--- +id: request-router +title: Request router +description: Learn how to use the Router class to organize request handlers, error handlers, and pre-navigation hooks in Crawlee. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BasicRequestHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/basic_request_handlers.py'; +import SimpleDefaultHandler from '!!raw-loader!roa-loader!./code_examples/request_router/simple_default_handler.py'; +import CustomRouterDefaultOnly from '!!raw-loader!roa-loader!./code_examples/request_router/custom_router_default_only.py'; +import HttpPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/http_pre_navigation.py'; +import ErrorHandler from '!!raw-loader!roa-loader!./code_examples/request_router/error_handler.py'; +import FailedRequestHandler from '!!raw-loader!roa-loader!./code_examples/request_router/failed_request_handler.py'; +import PlaywrightPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/playwright_pre_navigation.py'; +import AdaptiveCrawlerHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/adaptive_crawler_handlers.py'; + +The `Router` class manages request flow and coordinates the execution of user-defined logic in Crawlee projects. 
It routes incoming requests to appropriate user-defined handlers based on labels, manages error scenarios, and provides hooks for pre-navigation execution. The `Router` serves as the orchestrator for all crawling operations, ensuring that each request is processed by the correct handler according to its type and label. + +## Request handlers + +Request handlers are user-defined functions that process individual requests and their corresponding responses. Each handler receives a crawling context as its primary argument, which provides access to the current request, response data, and utility methods for data extraction, link enqueuing, and storage operations. Handlers determine how different types of pages are processed and how data is extracted and stored. + +:::note + +The code examples in this guide use `ParselCrawler` for demonstration, but the `Router` works with all crawler types. + +::: + +### Built-in router + +Every crawler instance includes a built-in `Router` accessible through the `crawler.router` property. This approach simplifies initial setup and covers basic use cases where request routing requirements are straightforward. + + + {SimpleDefaultHandler} + + +The default handler processes all requests that either lack a label or have a label for which no specific handler has been registered. + +### Custom router + +Applications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom `Router` instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed. + +You can also implement a custom request router class from scratch or by inheriting from `Router`. This allows you to define custom routing logic or manage request handlers in a different way. 
+ + + {CustomRouterDefaultOnly} + + +### Advanced routing by labels + +More complex crawling projects often require different processing logic for various page types. The router supports label-based routing, which allows registration of specialized handlers for specific content categories. This pattern enables clean separation of concerns and targeted processing logic for different URL patterns or content types. + + + {BasicRequestHandlers} + + +## Error handlers + +Crawlee provides error handling mechanisms to manage request processing failures. It distinguishes between recoverable errors that may succeed on retry and permanent failures that require alternative handling strategies. + +### Error handler + +The error handler executes when exceptions occur during request processing, before any retry attempts. This handler receives the error context and can implement custom recovery logic, modify request parameters, or determine whether the request should be retried. Error handlers enable control over failure scenarios and allow applications to implement error recovery strategies. + + + {ErrorHandler} + + +### Failed request handler + +The failed request handler executes when a request has exhausted all retry attempts and is considered permanently failed. This handler serves as the final opportunity to log failures, store failed requests for later analysis, create alternative requests, or implement fallback processing strategies. + + + {FailedRequestHandler} + + +## Pre-navigation hooks + +Pre-navigation hooks execute before each request is processed, providing opportunities to configure request parameters, modify browser settings, or implement request-specific optimizations. You can use pre-navigation hooks for example for viewport configuration, resource blocking, timeout management, header customization, custom proxy rotation, and request interception. + +### HTTP crawler + +HTTP crawlers support pre-navigation hooks that execute before making HTTP requests. 
These hooks enable request modification, header configuration, and other HTTP-specific optimizations. + + + {HttpPreNavigation} + + +### Playwright crawler + +Playwright crawlers provide extensive pre-navigation capabilities that allow browser page configuration before navigation. These hooks can modify browser behavior and configure page settings. + + + {PlaywrightPreNavigation} + + +### Adaptive Playwright crawler + +The `AdaptivePlaywrightCrawler` implements a dual-hook system with common hooks that execute for all requests and Playwright-specific hooks that execute only when browser automation is required. This is perfect for projects that need both static and dynamic content handling. + + + {AdaptiveCrawlerHandlers} + + +## Conclusion + +This guide introduced you to the `Router` class and how to organize your crawling logic. You learned how to use built-in and custom routers, implement request handlers with label-based routing, handle errors with error and failed request handlers, and configure pre-navigation hooks for different crawler types. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/website/versioned_docs/version-1.6/guides/running_in_web_server.mdx b/website/versioned_docs/version-1.6/guides/running_in_web_server.mdx new file mode 100644 index 0000000000..63f907e616 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/running_in_web_server.mdx @@ -0,0 +1,47 @@ +--- +id: running-in-web-server +title: Running in web server +description: Running in web server +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; +import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; + + +Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. + +We will build a simple HTTP server that receives a page URL and returns the page title in the response. + +## Set up a web server + +There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), ... In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. + +This will be our core server setup: + + + {Server} + + +The server has two endpoints: +- `/` - The index endpoint provides a short description of the server, with an example link to the second endpoint.
+- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL. + +To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) package, and then run the following command from the directory where the example code is located: +``` +fastapi dev server.py +``` + +## Create a crawler + +We will create a standard `ParselCrawler` and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. + + + {Crawler} + + +The crawler is defined inside a [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan) context, which is the FastAPI way to run startup/teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`: +- `crawler` holds an instance of our crawler and allows the app to interact with it. +- `requests_to_results` is a dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler. diff --git a/website/versioned_docs/version-1.6/guides/scaling_crawlers.mdx b/website/versioned_docs/version-1.6/guides/scaling_crawlers.mdx new file mode 100644 index 0000000000..5dce8ac640 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/scaling_crawlers.mdx @@ -0,0 +1,49 @@ +--- +id: scaling-crawlers +title: Scaling crawlers +description: Learn how to scale your crawlers by controlling concurrency and limiting requests per minute.
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import MaxTasksPerMinuteExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/max_tasks_per_minute_example.py'; +import MinAndMaxConcurrencyExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/min_and_max_concurrency_example.py'; + +As we build our crawler, we may want to control how many tasks it performs at any given time. In other words, how many requests it makes to the web we are trying to scrape. Crawlee offers several options to fine-tune the number of parallel tasks, limit the number of requests per minute, and optimize scaling based on available system resources. + +:::tip + +All of these options are available across all crawlers provided by Crawlee. In this guide, we are using the `BeautifulSoupCrawler` as an example. You should also explore the `ConcurrencySettings`. + +::: + +## Max tasks per minute + +The `max_tasks_per_minute` setting in `ConcurrencySettings` controls how many total tasks the crawler can process per minute. It ensures that tasks are spread evenly throughout the minute, preventing a sudden burst at the `max_concurrency` limit followed by idle time. By default, this is set to `Infinity`, meaning the crawler can run at full speed, limited only by `max_concurrency`. Use this option if you want to throttle your crawler to avoid overwhelming the target website with continuous requests. + + + {MaxTasksPerMinuteExample} + + +## Minimum and maximum concurrency + +The `min_concurrency` and `max_concurrency` options in the `ConcurrencySettings` define the minimum and maximum number of parallel tasks that can run at any given time. By default, crawlers start with a single parallel task and gradually scale up to a maximum of concurrent requests. 
+ +:::caution Avoid setting minimum concurrency too high + +If you set `min_concurrency` too high compared to the available system resources, the crawler may run very slowly or even crash. It is recommended to stick with the default value and let the crawler automatically adjust concurrency based on the system's available resources. + +::: + +## Desired concurrency + +The `desired_concurrency` option in the `ConcurrencySettings` specifies the initial number of parallel tasks to start with, assuming sufficient resources are available. It defaults to the same value as `min_concurrency`. + + + {MinAndMaxConcurrencyExample} + + +## Autoscaled pool + +The `AutoscaledPool` manages a pool of asynchronous, resource-intensive tasks that run in parallel. It automatically starts new tasks only when there is enough free CPU and memory. To monitor system resources, it leverages the `Snapshotter` and `SystemStatus` classes. If any task raises an exception, the error is propagated, and the pool is stopped. Every crawler uses an `AutoscaledPool` under the hood. diff --git a/website/versioned_docs/version-1.6/guides/service_locator.mdx b/website/versioned_docs/version-1.6/guides/service_locator.mdx new file mode 100644 index 0000000000..fe10ce50c2 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/service_locator.mdx @@ -0,0 +1,136 @@ +--- +id: service-locator +title: Service locator +description: Crawlee's service locator is a central registry for global services, managing and providing access to them throughout the whole framework. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ServiceLocatorConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_configuration.py'; +import ServiceLocatorStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_storage_client.py'; +import ServiceLocatorEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_event_manager.py'; + +import ServiceCrawlerConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_configuration.py'; +import ServiceCrawlerStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_storage_client.py'; +import ServiceCrawlerEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_event_manager.py'; + +import ServiceStorageConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_configuration.py'; +import ServiceStorageStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_storage_client.py'; + +import ServiceConflicts from '!!raw-loader!roa-loader!./code_examples/service_locator/service_conflicts.py'; + +The `ServiceLocator` is a central registry for global services. It manages and provides access to these services throughout the framework, ensuring their consistent configuration across all components. + +The service locator manages three core services: `Configuration`, `EventManager`, and `StorageClient`. All services are initialized lazily with defaults when first accessed. + +## Services + +There are three core services that are managed by the service locator: + +### Configuration + +`Configuration` is a class that provides access to application-wide settings and parameters.
It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables. + +### StorageClient + +`StorageClient` is the backend implementation for storages in Crawlee. It provides a unified interface for `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. Storage clients were already explained in the storage clients section. + +Refer to the [Storage clients guide](./storage-clients) for more information about storage clients and how to use them. + +### EventManager + +`EventManager` is responsible for coordinating internal events in Crawlee. It allows you to register event listeners and emit events throughout the framework. Examples of such events include aborting, migrating, system info, or browser-specific events like page created, page closed, and more. It provides a way to listen to events and execute custom logic when certain events occur. + +## Service registration + +There are several ways to register services in Crawlee, depending on your use case and preferences. + +### Via service locator + +Services can be registered globally through the `ServiceLocator` before they are first accessed. There is a singleton `service_locator` instance that is used throughout the framework, making the services available to all of its components. + + + + + + {ServiceLocatorStorageClient} + + + + + + {ServiceLocatorConfiguration} + + + + + + {ServiceLocatorEventManager} + + + + + +### Via crawler constructors + +Alternatively, services can be passed to the crawler constructors. They will be registered globally to the `ServiceLocator` under the hood, making them available to all components and ensuring consistent configuration.
+ + + + + + {ServiceCrawlerStorageClient} + + + + + + {ServiceCrawlerConfiguration} + + + + + + {ServiceCrawlerEventManager} + + + + + +### Via storage constructors + +Alternatively, services can be provided when opening specific storage instances, which uses them only for that particular instance without affecting global configuration. + + + + + + {ServiceStorageStorageClient} + + + + + + {ServiceStorageConfiguration} + + + + + +## Conflict prevention + +Once a service has been retrieved from the service locator, attempting to set a different instance will raise a `ServiceConflictError` to prevent accidental configuration conflicts. + + + {ServiceConflicts} + + +## Conclusion + +The `ServiceLocator` is a tool for managing global services in Crawlee. It provides a consistent way to configure and access services throughout the framework, ensuring that all components have access to the same configuration and services. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/session_management.mdx b/website/versioned_docs/version-1.6/guides/session_management.mdx new file mode 100644 index 0000000000..a3a1385db1 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/session_management.mdx @@ -0,0 +1,94 @@ +--- +id: session-management +title: Session management +description: How to manage your cookies, proxy IP rotations and more. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BasicSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_basic.py'; +import HttpSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_http.py'; +import BeautifulSoupSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_beautifulsoup.py'; +import ParselSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_parsel.py'; +import PlaywrightSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_playwright.py'; +import StandaloneSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_standalone.py'; +import OneSession from '!!raw-loader!roa-loader!./code_examples/session_management/one_session_http.py'; +import MultiSessions from '!!raw-loader!roa-loader!./code_examples/session_management/multi_sessions_http.py'; + +The `SessionPool` class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies. + +Additionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address. + +Finally, it ensures even IP address rotation by randomly selecting sessions. This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper. + +For more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide. 
+ +Now, let's explore examples of how to use the `SessionPool` in different scenarios: +- with `BasicCrawler`; +- with `HttpCrawler`; +- with `BeautifulSoupCrawler`; +- with `ParselCrawler`; +- with `PlaywrightCrawler`; +- without a crawler (standalone usage to manage sessions manually). + + + + + {BasicSource} + + + + + {HttpSource} + + + + + {BeautifulSoupSource} + + + + + {ParselSource} + + + + + {PlaywrightSource} + + + + + {StandaloneSource} + + + + +These examples demonstrate the basics of configuring and using the `SessionPool`. + +Please, bear in mind that `SessionPool` requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected and will improve over time. + +## Configuring a single session + +In some cases, you need full control over session usage. For example, when working with websites requiring authentication or initialization of certain parameters like cookies. + +When working with a site that requires authentication, we typically don't want multiple sessions with different browser fingerprints or client parameters accessing the site. In this case, we need to configure the `SessionPool` appropriately: + + + {OneSession} + + +## Binding requests to specific sessions + +In the previous example, there's one obvious limitation - you're restricted to only one session. + +In some cases, we need to achieve the same behavior but using multiple sessions in parallel, such as authenticating with different profiles or using different proxies. 
+ +To do this, use the `session_id` parameter for the `Request` object to bind a request to a specific session: + + + {MultiSessions} + diff --git a/website/versioned_docs/version-1.6/guides/storage_clients.mdx b/website/versioned_docs/version-1.6/guides/storage_clients.mdx new file mode 100644 index 0000000000..d5aa8bb871 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/storage_clients.mdx @@ -0,0 +1,555 @@ +--- +id: storage-clients +title: Storage clients +description: How to work with storage clients in Crawlee, including the built-in clients and how to create your own. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; + +import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py'; +import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; +import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; +import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; +import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py'; +import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py'; +import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py'; +import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py'; +import 
RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py'; + +Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups. + +## Built-in storage clients + +Crawlee provides several storage client implementations: + +- `FileSystemStorageClient` - Provides persistent file system storage with in-memory caching. +- `MemoryStorageClient` - Stores data in memory with no persistence. +- `SqlStorageClient` - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/), [PostgreSQL](https://www.postgresql.org/), [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite, `crawlee[sql_postgres]` for PostgreSQL or `crawlee[sql_mysql]` for MySQL and MariaDB. +- `RedisStorageClient` - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency `crawlee[redis]`. +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class StorageClient { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class FileSystemStorageClient + +class MemoryStorageClient + +class SqlStorageClient + +class RedisStorageClient + +class ApifyStorageClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +StorageClient --|> FileSystemStorageClient +StorageClient --|> MemoryStorageClient +StorageClient --|> SqlStorageClient +StorageClient --|> RedisStorageClient +StorageClient --|> ApifyStorageClient +``` + +### File system storage client + +The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format. This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required. + +:::warning Concurrency limitation +The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. +::: + +This storage client is ideal for large datasets, and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools. + + + {FileSystemStorageClientBasicExample} + + +Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class: + +- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory for all storage data. +- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. 
+ +Data is stored using the following directory structure: + +```text +{CRAWLEE_STORAGE_DIR}/ +├── datasets/ +│ └── {DATASET_NAME}/ +│ ├── __metadata__.json +│ ├── 000000001.json +│ └── 000000002.json +├── key_value_stores/ +│ └── {KVS_NAME}/ +│ ├── __metadata__.json +│ ├── key1.json +│ ├── key2.txt +│ └── key3.json +└── request_queues/ + └── {RQ_NAME}/ + ├── __metadata__.json + ├── {REQUEST_ID_1}.json + └── {REQUEST_ID_2}.json +``` + +Where: +- `{CRAWLEE_STORAGE_DIR}` - The root directory for local storage. +- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}` - The unique names for each storage instance (defaults to `"default"`). +- Files are stored directly without additional metadata files for simpler structure. + +Here is an example of how to configure the `FileSystemStorageClient`: + + + {FileSystemStorageClientConfigurationExample} + + +### Memory storage client + +The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is primarily suitable for testing and development, and is usually not a good fit for production use. However, in some cases where speed is prioritized over persistence, it can make sense. + +:::warning Persistence limitation +The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. +::: + + + {MemoryStorageClientBasicExample} + + +### SQL storage client + +:::warning Experimental feature +The `SqlStorageClient` is experimental. Its API and behavior may change in future releases. +::: + +The `SqlStorageClient` provides persistent storage using a SQL database (SQLite by default, or PostgreSQL, MySQL, MariaDB). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes.
+ +:::note dependencies +The `SqlStorageClient` is not included in the core Crawlee package. +To use it, you need to install Crawlee with the appropriate extra dependency: + +- For SQLite support, run: + pip install 'crawlee[sql_sqlite]' +- For PostgreSQL support, run: + pip install 'crawlee[sql_postgres]' +- For MySQL or MariaDB support, run: + pip install 'crawlee[sql_mysql]' +::: + +By default, SqlStorageClient uses SQLite. +To use a different database, just provide the appropriate connection string via the `connection_string` parameter. No other code changes are needed — the same client works for all supported databases. + + + {SQLStorageClientBasicExample} + + +Data is organized in relational tables. Below are the main tables and columns used for each storage type: + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Storage Clients +%% ======================== + +class SqlDatasetClient { + <> +} + +class SqlKeyValueStoreClient { + <> +} + +%% ======================== +%% Dataset Tables +%% ======================== + +class datasets { + <> + + dataset_id (PK) + + internal_name + + name + + accessed_at + + created_at + + modified_at + + item_count + + buffer_locked_until +} + +class dataset_records { + <
> + + item_id (PK) + + dataset_id (FK) + + data +} + +class dataset_metadata_buffer { + <
> + + id (PK) + + accessed_at + + modified_at + + delta_item_count +} + +%% ======================== +%% Key-Value Store Tables +%% ======================== + +class key_value_stores { + <
> + + key_value_store_id (PK) + + internal_name + + name + + accessed_at + + created_at + + modified_at + + buffer_locked_until +} + +class key_value_store_records { + <
> + + key_value_store_id (FK, PK) + + key (PK) + + value + + content_type + + size +} + +class key_value_store_metadata_buffer { + <
> + + id (PK) + + accessed_at + + modified_at +} + +%% ======================== +%% Client to Table arrows +%% ======================== + +SqlDatasetClient --> datasets +SqlDatasetClient --> dataset_records +SqlDatasetClient --> dataset_metadata_buffer + +SqlKeyValueStoreClient --> key_value_stores +SqlKeyValueStoreClient --> key_value_store_records +SqlKeyValueStoreClient --> key_value_store_metadata_buffer +``` +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Storage Clients +%% ======================== + +class SqlRequestQueueClient { + <> +} + +%% ======================== +%% Request Queue Tables +%% ======================== + +class request_queues { + <
> + + request_queue_id (PK) + + internal_name + + name + + accessed_at + + created_at + + modified_at + + had_multiple_clients + + handled_request_count + + pending_request_count + + total_request_count + + buffer_locked_until +} + +class request_queue_records { + <
> + + request_id (PK) + + request_queue_id (FK, PK) + + data + + sequence_number + + is_handled + + time_blocked_until + + client_key +} + +class request_queue_state { + <
> + + request_queue_id (FK, PK) + + sequence_counter + + forefront_sequence_counter +} + +class request_queue_metadata_buffer { + <
> + + id (PK) + + accessed_at + + modified_at + + client_id + + delta_handled_count + + delta_pending_count + + delta_total_count + + need_recalc +} + +%% ======================== +%% Client to Table arrows +%% ======================== + +SqlRequestQueueClient --> request_queues +SqlRequestQueueClient --> request_queue_records +SqlRequestQueueClient --> request_queue_state +SqlRequestQueueClient --> request_queue_metadata_buffer +``` + +Configuration options for the `SqlStorageClient` can be set through environment variables or the `Configuration` class: + +- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided. +- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. + +Configuration options for the `SqlStorageClient` can be set via constructor arguments: + +- **`connection_string`** (default: SQLite in `Configuration` storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db`, `postgresql+asyncpg://user:pass@host/db`, `mysql+aiomysql://user:pass@host/db` or `mariadb+aiomysql://user:pass@host/db`. +- **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional). + +For advanced scenarios, you can configure `SqlStorageClient` with a custom SQLAlchemy engine and additional options via the `Configuration` class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling. + +:::warning +If you use MySQL or MariaDB, pass the `isolation_level='READ COMMITTED'` argument to `create_async_engine`. MySQL/MariaDB default to the `REPEATABLE READ` isolation level, which can cause unnecessary locking, deadlocks, or stale reads when multiple Crawlee workers access the same tables concurrently. Using `READ COMMITTED` ensures more predictable row-level locking and visibility semantics for `SqlStorageClient`. 
+::: + + + {SQLStorageClientConfigurationExample} + + +### Redis storage client + +:::warning Experimental feature +The `RedisStorageClient` is experimental. Its API and behavior may change in future releases. +::: + +The `RedisStorageClient` provides persistent storage using [Redis](https://redis.io/) database. It supports concurrent access from multiple independent clients or processes and uses Redis native data structures for efficient operations. + +:::note dependencies +The `RedisStorageClient` is not included in the core Crawlee package. +To use it, you need to install Crawlee with the Redis extra dependency: + +pip install 'crawlee[redis]' + +Additionally, Redis version 8.0 or higher is required. +::: + +:::note Redis persistence +Data persistence in Redis depends on your [database configuration](https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/). +::: + +The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption. + + + {RedisStorageClientBasicExample} + + +Data is organized using Redis key patterns. 
Below are the main data structures used for each storage type: + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Storage Client +%% ======================== + +class RedisDatasetClient { + <> +} + +%% ======================== +%% Dataset Keys +%% ======================== + +class DatasetKeys { + datasets:[name]:items - JSON Array + datasets:[name]:metadata - JSON Object +} + +class DatasetsIndexes { + datasets:id_to_name - Hash + datasets:name_to_id - Hash +} + +%% ======================== +%% Client to Keys arrows +%% ======================== + +RedisDatasetClient --> DatasetKeys +RedisDatasetClient --> DatasetsIndexes +``` + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Storage Clients +%% ======================== + +class RedisKeyValueStoreClient { + <> +} + +%% ======================== +%% Key-Value Store Keys +%% ======================== + +class KeyValueStoreKeys { + key_value_stores:[name]:items - Hash + key_value_stores:[name]:metadata_items - Hash + key_value_stores:[name]:metadata - JSON Object +} + +class KeyValueStoresIndexes { + key_value_stores:id_to_name - Hash + key_value_stores:name_to_id - Hash +} + +%% ======================== +%% Client to Keys arrows +%% ======================== + +RedisKeyValueStoreClient --> KeyValueStoreKeys +RedisKeyValueStoreClient --> KeyValueStoresIndexes +``` + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Storage Clients +%% ======================== + +class RedisRequestQueueClient { + <> +} + +%% ======================== +%% Request Queue Keys +%% ======================== + +class RequestQueueKeys{ + request_queues:[name]:queue - List + request_queues:[name]:data - Hash + request_queues:[name]:in_progress - Hash + request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy + 
request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy + request_queues:[name]:pending_set - Set | default queue_dedup_strategy + request_queues:[name]:handled_set - Set | default queue_dedup_strategy + request_queues:[name]:metadata - JSON Object +} + +class RequestQueuesIndexes { + request_queues:id_to_name - Hash + request_queues:name_to_id - Hash +} + +%% ======================== +%% Client to Keys arrows +%% ======================== + +RedisRequestQueueClient --> RequestQueueKeys +RedisRequestQueueClient --> RequestQueuesIndexes +``` + +Configuration options for the `RedisStorageClient` can be set through environment variables or the `Configuration` class: + +- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. + +Configuration options for the `RedisStorageClient` can be set via constructor arguments: + +- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`. +- **`redis`** - Pre-configured Redis client instance (optional). + + + {RedisStorageClientConfigurationExample} + + +## Creating a custom storage client + +A storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. + +Here is an example of a custom storage client that implements the `StorageClient` interface: + + + {CustomStorageClientExample} + + +Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages. 
+ +## Registering storage clients + +Storage clients can be registered in multiple ways: +- **Globally** - Using the `ServiceLocator` or passing directly to the crawler. +- **Per storage** - When opening a specific storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue`. + + + {RegisteringStorageClientsExample} + + +You can also register different storage clients for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use a fast in-memory storage for `RequestQueue` while persisting scraping results in `Dataset` or `KeyValueStore`. + +## Conclusion + +Storage clients in Crawlee provide different backends for data storage. Use `MemoryStorageClient` for testing and fast operations without persistence, or `FileSystemStorageClient` for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the `StorageClient` interface. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/storages.mdx b/website/versioned_docs/version-1.6/guides/storages.mdx new file mode 100644 index 0000000000..ff38ecba26 --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/storages.mdx @@ -0,0 +1,239 @@ +--- +id: storages +title: Storages +description: How to work with storages in Crawlee, how to manage requests and how to store and retrieve scraping results. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import OpeningExample from '!!raw-loader!roa-loader!./code_examples/storages/opening.py'; + +import RqBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_basic_example.py'; +import RqWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_example.py'; +import RqWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_explicit_example.py'; +import RqHelperAddRequestsExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_add_requests_example.py'; +import RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_enqueue_links_example.py'; + +import DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py'; +import DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py'; +import DatasetWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py'; + +import KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py'; +import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py'; +import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py'; + +import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py'; +import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py'; + +Crawlee offers several storage types for managing and persisting your crawling data. 
Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide explains when to use each type, how to interact with them, and how to control their lifecycle. + +## Overview + +Crawlee's storage system consists of two main layers: +- **Storages** (`Dataset`, `KeyValueStore`, `RequestQueue`): High-level interfaces for interacting with different storage types. +- **Storage clients** (`MemoryStorageClient`, `FileSystemStorageClient`, etc.): Backend implementations that handle the actual data persistence and management. + +For more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients). + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class Storage { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class Dataset + +class KeyValueStore + +class RequestQueue + +%% ======================== +%% Inheritance arrows +%% ======================== + +Storage --|> Dataset +Storage --|> KeyValueStore +Storage --|> RequestQueue +``` + +### Named and unnamed storages + +Crawlee supports two types of storages: + +- **Named storages**: Persistent storages with a specific name that persist across runs. These are useful when you want to share data between different crawler runs or access the same storage from multiple places. +- **Unnamed storages**: Temporary storages identified by an alias that are scoped to a single run. These are automatically purged at the start of each run (when `purge_on_start` is enabled, which is the default). + +### Default storage + +Each storage type (`Dataset`, `KeyValueStore`, `RequestQueue`) has a default instance that can be accessed without specifying `id`, `name` or `alias`. 
Default unnamed storage is accessed by calling storage's `open` method without parameters. This is the most common way to use storages in simple crawlers. + + + {OpeningExample} + + +## Request queue + +The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. + +The following code demonstrates the usage of the `RequestQueue`: + + + + + {RqBasicExample} + + + + + {RqWithCrawlerExample} + + + + + {RqWithCrawlerExplicitExample} + + + + +### Request-related helpers + +Crawlee provides helper functions to simplify interactions with the `RequestQueue`: + +- The `add_requests` function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the `Request` class to the helper. +- The `enqueue_links` function is designed to discover new URLs in the current page and add them to the request storage. It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details. + + + + + {RqHelperAddRequestsExample} + + + + + {RqHelperEnqueueLinksExample} + + + + +### Request manager + +The `RequestQueue` implements the `RequestManager` interface, offering a unified API for interacting with various request storage types. 
This provides a unified way to interact with different request storage types. + +If you need custom functionality, you can create your own request storage by subclassing the `RequestManager` class and implementing its required methods. + +For a detailed explanation of the `RequestManager` and other related components, refer to the [Request loaders guide](https://crawlee.dev/python/docs/guides/request-loaders). + +## Dataset + +The `Dataset` is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. Think of a `Dataset` as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional. + +The following code demonstrates basic operations of the dataset: + + + + + {DatasetBasicExample} + + + + + {DatasetWithCrawlerExample} + + + + + {DatasetWithCrawlerExplicitExample} + + + + +### Dataset-related helpers + +Crawlee provides the following helper function to simplify interactions with the `Dataset`: + +- The `push_data` function allows you to manually add data to the dataset. You can optionally specify the dataset ID or its name. + +## Key-value store + +The `KeyValueStore` is designed to save and retrieve data records or files efficiently. Each record is uniquely identified by a key and is associated with a specific MIME type, making the `KeyValueStore` ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers. 
+ +The following code demonstrates the usage of the `KeyValueStore`: + + + + + {KvsBasicExample} + + + + + {KvsWithCrawlerExample} + + + + + {KvsWithCrawlerExplicitExample} + + + + +To see a real-world example of how to get the input from the key-value store, see the [Screenshots](https://crawlee.dev/python/docs/examples/capture-screenshots-using-playwright) example. + +### Key-value store-related helpers + +Crawlee provides the following helper function to simplify interactions with the `KeyValueStore`: + +- The `get_key_value_store` function retrieves the key-value store for the current crawler run. If the KVS does not exist, it will be created. You can also specify the KVS's ID or its name. + +## Cleaning up the storages + +By default, Crawlee cleans up all unnamed storages (including the default one) at the start of each run, so every crawl begins with a clean state. This behavior is controlled by `Configuration.purge_on_start` (default: True). In contrast, named storages are never purged automatically and persist across runs. The exact behavior may vary depending on the storage client implementation. + +### When purging happens + +The cleanup occurs as soon as a storage is accessed: +- When opening a storage explicitly (e.g., `RequestQueue.open`, `Dataset.open`, `KeyValueStore.open`). +- When using helper functions that implicitly open storages (e.g., `push_data`). +- Automatically when `BasicCrawler.run` is invoked. + +### Disabling automatic purging + +To disable automatic purging, set `purge_on_start=False` in your configuration: + + + {CleaningDoNotPurgeExample} + + +### Manual purging + +Purge on start behavior just triggers the storage's `purge` method, which removes all data from the storage. If you want to purge the storage manually, you can do so by calling the `purge` method on the storage instance. 
Or if you want to delete the storage completely, you can call the `drop` method on the storage instance, which will remove the storage, including metadata and all its data. + + + {CleaningPurgeExplicitlyExample} + + +Note that purging behavior may vary between storage client implementations. For more details on storage configuration and client implementations, see the [Storage clients guide](./storage-clients). + +## Conclusion + +This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned about the distinction between named storages (persistent across runs) and unnamed storages with aliases (temporary and purged on start). You discovered how to manage requests using the `RequestQueue` and store and retrieve scraping results using the `Dataset` and `KeyValueStore`. You also learned how to use helper functions to simplify interactions with these storages and how to control storage cleanup behavior. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/website/versioned_docs/version-1.6/guides/trace_and_monitor_crawlers.mdx b/website/versioned_docs/version-1.6/guides/trace_and_monitor_crawlers.mdx new file mode 100644 index 0000000000..5a063ca12c --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/trace_and_monitor_crawlers.mdx @@ -0,0 +1,52 @@ +--- +id: trace-and-monitor-crawlers +title: Trace and monitor crawlers +description: Learn how to instrument your crawlers with OpenTelemetry to trace request handling, identify bottlenecks, monitor performance, and visualize telemetry data using Jaeger for performance optimization. 
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import InstrumentCrawler from '!!raw-loader!./code_examples/trace_and_monitor_crawlers/instrument_crawler.py';
+
+[OpenTelemetry](https://opentelemetry.io/) is a collection of APIs, SDKs, and tools to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) to help you analyze your software's performance and behavior. In the context of crawler development, it can be used to better understand how the crawler internally works, identify bottlenecks, debug, log metrics, and more. The topic described in this guide requires at least a basic understanding of OpenTelemetry. A good place to start is [What is open telemetry](https://opentelemetry.io/docs/what-is-opentelemetry/).
+
+In this guide, it will be shown how to set up OpenTelemetry and instrument a specific crawler to see traces of individual requests that are being processed by the crawler. OpenTelemetry on its own does not provide an out-of-the-box tool for convenient visualisation of the exported data (apart from printing to the console), but there are several good available tools to do that. In this guide, we will use [Jaeger](https://www.jaegertracing.io/) to visualise the telemetry data. To better understand concepts such as exporter, collector, and visualisation backend, please refer to the [OpenTelemetry documentation](https://opentelemetry.io/docs/collector/).
+
+## Set up Jaeger
+
+This guide will show how to set up the environment locally to run the example code and visualize the telemetry data in Jaeger that will be running locally in a [docker](https://www.docker.com/) container.
+
+To start the preconfigured Docker container, you can use the following command:
+
+```bash
+docker run -d --name jaeger -e COLLECTOR_OTLP_ENABLED=true -p 16686:16686 -p 4317:4317 -p 4318:4318 jaegertracing/all-in-one:latest
+```
+For more details about the Jaeger setup, see the [getting started](https://www.jaegertracing.io/docs/2.7/getting-started/) section in their documentation.
+You can see the Jaeger UI in your browser by navigating to http://localhost:16686.
+
+## Instrument the Crawler
+
+Now you can proceed with instrumenting the crawler to send the telemetry data to Jaeger and running it. To have the Python environment ready, you should install either **crawlee[all]** or **crawlee[otel]**. This will ensure that OpenTelemetry dependencies are installed, and you can run the example code snippet.
+In the following example, you can see the function `instrument_crawler` that contains the instrumentation setup and is called before the crawler is started. If you have already set up Jaeger, then you can just run the following code snippet.
Depending on the arguments used during its initialization, the instrumentation will be applied to different parts of the Crawlee code. By default, it instruments some functions that can give quite a good picture of each individual request handling. To turn this default instrumentation off, you can pass `request_handling_instrumentation=False` during initialization. You can also extend instrumentation by passing `instrument_classes=[...]` initialization argument that contains classes you want to be auto-instrumented. All their public methods will be automatically instrumented. Bear in mind that instrumentation has some runtime costs as well. The more instrumentation is used, the more overhead it will add to the crawler execution. + +You can also create your instrumentation by selecting only the methods you want to instrument. For more details, see the `CrawlerInstrumentor` source code and the [Python documentation for OpenTelemetry](https://opentelemetry.io/docs/languages/python/). + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). diff --git a/website/versioned_docs/version-1.6/introduction/01_setting_up.mdx b/website/versioned_docs/version-1.6/introduction/01_setting_up.mdx new file mode 100644 index 0000000000..4c5215a576 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/01_setting_up.mdx @@ -0,0 +1,153 @@ +--- +id: setting-up +title: Setting up +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide will help you get started with Crawlee by setting it up on your computer. Follow the steps below to ensure a smooth installation process. 
+ +## Prerequisites + +Before installing Crawlee itself, make sure that your system meets the following requirements: + +- **Python 3.10 or higher**: Crawlee requires Python 3.10 or a newer version. You can download Python from the [official website](https://python.org/downloads/). +- **Python package manager**: While this guide uses [pip](https://pip.pypa.io/) (the most common package manager), you can also use any package manager you want. You can download pip from the [official website](https://pip.pypa.io/en/stable/installation/). + +### Verifying prerequisites + +To check if Python and pip are installed, run the following commands: + +```sh +python --version +``` + +```sh +python -m pip --version +``` + +If these commands return the respective versions, you're ready to continue. + +## Installing Crawlee + +Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. 
+ +### Basic installation + +To install the core package, run: + +```sh +python -m pip install crawlee +``` + +After installation, verify that Crawlee is installed correctly by checking its version: + +```sh +python -c 'import crawlee; print(crawlee.__version__)' +``` + +### Full installation + +If you do not mind the package size, you can run the following command to install Crawlee with all optional features: + +```sh +python -m pip install 'crawlee[all]' +``` + +### Installing specific extras + +Depending on your use case, you may want to install specific extras to enable additional functionality: + +For using the `BeautifulSoupCrawler`, install the `beautifulsoup` extra: + +```sh +python -m pip install 'crawlee[beautifulsoup]' +``` + +For using the `ParselCrawler`, install the `parsel` extra: + +```sh +python -m pip install 'crawlee[parsel]' +``` + +For using the `CurlImpersonateHttpClient`, install the `curl-impersonate` extra: + +```sh +python -m pip install 'crawlee[curl-impersonate]' +``` + +If you plan to use a (headless) browser with `PlaywrightCrawler`, install Crawlee with the `playwright` extra: + +```sh +python -m pip install 'crawlee[playwright]' +``` + +After installing the playwright extra, install the necessary Playwright dependencies: + +```sh +playwright install +``` + +### Installing multiple extras + +You can install multiple extras at once by using a comma as a separator: + +```sh +python -m pip install 'crawlee[beautifulsoup,curl-impersonate]' +``` + +## Start a new project + +The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. The CLI helps you set up a new project in seconds. + +### Using Crawlee CLI with uv + +First, ensure you have [uv](https://pypi.org/project/uv/) installed. 
You can check if it is installed by running: + +```sh +uv --version +``` + +If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/). + +Then, run the Crawlee CLI using `uvx` and choose from the available templates: + +```sh +uvx 'crawlee[cli]' create my-crawler +``` + +### Using Crawlee CLI directly + +If you already have `crawlee` installed, you can spin it up by running: + +```sh +crawlee create my_crawler +``` + +Follow the interactive prompts in the CLI to choose a crawler type and set up your new project. + +### Running your project + +To run your newly created project, navigate to the project directory, activate the virtual environment, and execute the Python interpreter with the project module: + + + + cd my_crawler/ + source .venv/bin/activate + python -m my_crawler + + + cd my_crawler/ + venv\Scripts\activate + python -m my_crawler + + + +Congratulations! You have successfully set up and executed your first Crawlee project. + +## Next steps + +Next, you will learn how to create a very simple crawler and Crawlee components while building it. diff --git a/website/versioned_docs/version-1.6/introduction/02_first_crawler.mdx b/website/versioned_docs/version-1.6/introduction/02_first_crawler.mdx new file mode 100644 index 0000000000..203ab92146 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/02_first_crawler.mdx @@ -0,0 +1,95 @@ +--- +id: first-crawler +title: First crawler +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RequestQueueExample from '!!raw-loader!roa-loader!./code_examples/02_request_queue.py'; +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/02_bs.py'; +import BeautifulSoupBetterExample from '!!raw-loader!roa-loader!./code_examples/02_bs_better.py'; + +Now, you will build your first crawler. 
But before you do, let's briefly introduce the Crawlee classes involved in the process. + +## How Crawlee works + +There are 3 main crawler classes available for use in Crawlee. + +- `BeautifulSoupCrawler` +- `ParselCrawler` +- `PlaywrightCrawler` + +We'll talk about their differences later. Now, let's talk about what they have in common. + +The general idea of each crawler is to go to a web page, open it, do some stuff there, save some results, continue to the next page, and repeat this process until the crawler's done its job. So the crawler always needs to find answers to two questions: _Where should I go?_ and _What should I do there?_ Answering those two questions is the only required setup. The crawlers have reasonable defaults for everything else. + +### The where - `Request` and `RequestQueue` + +All crawlers use instances of the `Request` class to determine where they need to go. Each request may hold a lot of information, but at the very least, it must hold a URL - a web page to open. But having only one URL would not make sense for crawling. Sometimes you have a pre-existing list of your own URLs that you wish to visit, perhaps a thousand. Other times you need to build this list dynamically as you crawl, adding more and more URLs to the list as you progress. Most of the time, you will use both options. + +The requests are stored in a `RequestQueue`, a dynamic queue of `Request` instances. You can seed it with start URLs and also add more requests while the crawler is running. This allows the crawler to open one page, extract interesting data, such as links to other pages on the same domain, add them to the queue (called _enqueuing_) and repeat this process to build a queue of virtually unlimited number of URLs. + +### The what - request handler + +In the request handler you tell the crawler what to do at each and every page it visits. 
You can use it to handle extraction of data from the page, processing the data, saving it, calling APIs, doing calculations and so on. + +The request handler is a user-defined function, invoked automatically by the crawler for each `Request` from the `RequestQueue`. It always receives a single argument - `BasicCrawlingContext` (or its descendants). Its properties change depending on the crawler class used, but it always includes the `request` property, which represents the currently crawled URL and related metadata. + +## Building a crawler + +Let's put the theory into practice and start with something easy. Visit a page and get its HTML title. In this tutorial, you'll scrape the Crawlee website [https://crawlee.dev](https://crawlee.dev), but the same code will work for any website. + +### Adding requests to the crawling queue + +Earlier you learned that the crawler uses a queue of requests as its source of URLs to crawl. Let's create it and add the first request. + + + {RequestQueueExample} + + +The `RequestQueue.add_request` method automatically converts the object with URL string to a `Request` instance. So now you have a `RequestQueue` that holds one request which points to `https://crawlee.dev`. + +:::tip Bulk add requests + +The code above is for illustration of the request queue concept. Soon you'll learn about the `BasicCrawler.add_requests` method which allows you to skip this initialization code, and it also supports adding a large number of requests without blocking. + +::: + +### Building a BeautifulSoupCrawler + +Crawlee comes with three main crawler classes: `BeautifulSoupCrawler`, `ParselCrawler`, and `PlaywrightCrawler`. You can read their short descriptions in the [Quick start](../quick-start) lesson. + +Unless you have a good reason to start with a different one, you should try building a `BeautifulSoupCrawler` first. 
It is an HTTP crawler with HTTP2 support, anti-blocking features and integrated HTML parser - [BeautifulSoup](https://pypi.org/project/beautifulsoup4/). It's fast, simple, cheap to run and does not require complicated dependencies. The only downside is that it won't work out of the box for websites which require JavaScript rendering. But you might not need JavaScript rendering at all, because many modern websites use server-side rendering. + +Let's continue with the earlier `RequestQueue` example. + + + {BeautifulSoupExample} + + +When you run the example, you will see the title of https://crawlee.dev printed to the log. What really happens is that `BeautifulSoupCrawler` first makes an HTTP request to `https://crawlee.dev`, then parses the received HTML with BeautifulSoup and makes it available as the `context` argument of the request handler. + +```log +[__main__] INFO The title of "https://crawlee.dev" is "Crawlee ยท Build reliable crawlers. Fast. | Crawlee". +``` + +### Add requests faster + +Earlier we mentioned that you'll learn how to use the `BasicCrawler.add_requests` method to skip the request queue initialization. It's simple. Every crawler has an implicit `RequestQueue` instance, and you can add requests to it with the `BasicCrawler.add_requests` method. In fact, you can go even further and just use the first parameter of `crawler.run()`! + + + {BeautifulSoupBetterExample} + + +When you run this code, you'll see exactly the same output as with the earlier, longer example. The `RequestQueue` is still there, it's just managed by the crawler automatically. + +:::info + +This method not only makes the code shorter, it will help with performance too! Internally it calls `RequestQueue.add_requests_batched` method. It will wait only for the initial batch of 1000 requests to be added to the queue before resolving, which means the processing will start almost instantly. 
After that, it will continue adding the rest of the requests in the background (again, in batches of 1000 items, once every second). + +::: + +## Next steps + +Next, you'll learn about crawling links. That means finding new URLs on the pages you crawl and adding them to the `RequestQueue` for the crawler to visit. diff --git a/website/versioned_docs/version-1.6/introduction/03_adding_more_urls.mdx b/website/versioned_docs/version-1.6/introduction/03_adding_more_urls.mdx new file mode 100644 index 0000000000..7583e3494e --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/03_adding_more_urls.mdx @@ -0,0 +1,120 @@ +--- +id: adding-more-urls +title: Adding more URLs +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import OriginalCodeExample from '!!raw-loader!roa-loader!./code_examples/03_original_code.py'; +import FindingNewLinksExample from '!!raw-loader!roa-loader!./code_examples/03_finding_new_links.py'; +import EnqueueStrategyExample from '!!raw-loader!roa-loader!./code_examples/03_enqueue_strategy.py'; +import GlobsExample from '!!raw-loader!roa-loader!./code_examples/03_globs.py'; +import TransformExample from '!!raw-loader!roa-loader!./code_examples/03_transform_request.py'; + +Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code: + + + {OriginalCodeExample} + + +Now you'll use the example from the previous section and improve on it. You'll add more URLs to the queue and thanks to that the crawler will keep going, finding new links, enqueuing them into the `RequestQueue` and then scraping them. + +## How crawling works + +The process is simple: + +1. Find new links on the page. +2. Filter only those pointing to the same domain, in this case [crawlee.dev](https://crawlee.dev/). +3. Enqueue (add) them to the `RequestQueue`. +4. 
Visit the newly enqueued links. +5. Repeat the process. + +In the following paragraphs you will learn about the `enqueue_links` function which simplifies crawling to a single function call. + +:::tip context awareness + +The `enqueue_links` function is context aware. It means that it will read the information about the currently crawled page from the context, and you don't need to explicitly provide any arguments. However, you can specify filtering criteria or an enqueuing strategy if desired. It will find the links and automatically add the links to the running crawler's `RequestQueue`. + +::: + +## Limit your crawls + +When you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called `max_requests_per_crawl`, is available in all crawlers, and you can set it like this: + +```python +crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) +``` + +This means that no new requests will be started after the 20th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases. + +## Finding new links + +There are numerous approaches to finding links to follow when crawling the web. For our purposes, we will be looking for `` elements that contain the `href` attribute because that's what you need in most cases. For example: + +```html +This is a link to Crawlee introduction +``` + +Since this is the most common case, it is also the `enqueue_links` default. + + + {FindingNewLinksExample} + + +If you need to override the default selection of elements in `enqueue_links`, you can use the `selector` argument. + +```python +await context.enqueue_links(selector='a.article-link') +``` + +## Filtering links to same domain + +Websites typically contain a lot of links that lead away from the original page. 
This is normal, but when crawling a website, we usually want to crawl that one site and not let our crawler wander away to Google, Facebook and Twitter. Therefore, we need to filter out the off-domain links and only keep the ones that lead to the same domain. + +```python +# The default behavior of enqueue_links is to stay on the same hostname, so it does not require +# any parameters. This will ensure the subdomain stays the same. +await context.enqueue_links() +``` + +The default behavior of `enqueue_links` is to stay on the same hostname. This **does not include subdomains**. To include subdomains in your crawl, use the `strategy` argument. The `strategy` argument is an instance of the `EnqueueStrategy` type alias. + + + {EnqueueStrategyExample} + + +When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on. + +## Skipping duplicate URLs + +Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the `RequestQueue` which deduplicates requests using their `unique_key`. This `unique_key` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs. + +## Advanced filtering arguments + +While the defaults for `enqueue_links` can be often exactly what you need, it also gives you fine-grained control over which URLs should be enqueued. One way we already mentioned above. It is using the `EnqueueStrategy` type alias. You can use the `all` strategy if you want to follow every single link, regardless of its domain, or you can enqueue links that target the same domain name with the `same-domain` strategy. + +```python +# Wanders the internet. 
+await context.enqueue_links(strategy='all') +``` + +### Filter URLs with patterns + +For even more control, you can use the `include` or `exclude` parameters, either as glob patterns or regular expressions, to filter the URLs. Refer to the API documentation for `enqueue_links` for detailed information on these and other available options. + + + {GlobsExample} + + +### Transform requests before enqueuing + +For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function receives a `RequestOptions` object and should return either a modified `RequestOptions` object, or a string of type `RequestTransformAction`, which only allows the values `skip` and `unchanged`. Returning `skip` means the request will be skipped, while `unchanged` will add it without any changes + + + {TransformExample} + + +## Next steps + +Next, you will start your project of scraping a production website and learn some more Crawlee tricks in the process. diff --git a/website/versioned_docs/version-1.6/introduction/04_real_world_project.mdx b/website/versioned_docs/version-1.6/introduction/04_real_world_project.mdx new file mode 100644 index 0000000000..61f6435980 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/04_real_world_project.mdx @@ -0,0 +1,159 @@ +--- +id: real-world-project +title: Real-world project +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import SanityCheckExample from '!!raw-loader!roa-loader!./code_examples/04_sanity_check.py'; + +> _Hey, guys, you know, it's cool that we can scrape the `` elements of web pages, but that's not very useful. Can we finally scrape some real data and save it somewhere in a machine-readable format? Because that's why I started reading this tutorial in the first place!_ + +We hear you, young padawan! First, learn how to crawl, you must. 
Only then, walk through data, you can! + +## Making a production-grade crawler + +Making a production-grade crawler is not difficult, but there are many pitfalls of scraping that can catch you off guard. So for the real world project you'll learn how to scrape an [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) instead of the Crawlee website. It contains a list of products of different categories, and each product has its own detail page. + +The website requires JavaScript rendering, which allows us to showcase more features of Crawlee. We've also added some helpful tips that prepare you for the real-world issues that you will surely encounter when scraping at scale. + +:::tip Not interested in theory? + +If you're not interested in crawling theory, feel free to [skip to the next chapter](./crawling) and get right back to coding. + +::: + +## Drawing a plan + +Sometimes scraping is really straightforward, but most of the time, it really pays off to do a bit of research first and try to answer some of these questions: + +- How is the website structured? +- Can I scrape it only with HTTP requests (read "with some <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, e.g. <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>")? +- Do I need a headless browser for something? +- Are there any anti-scraping protections in place? +- Do I need to parse the HTML or can I get the data otherwise, such as directly from the website's API? + +For the purposes of this tutorial, let's assume that the website cannot be scraped with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>. It actually can, but we would have to dive a bit deeper than this introductory guide allows. So for now we will make things easier for you, scrape it with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, and you'll learn about headless browsers in the process. 
+ +## Choosing the data you need + +A good first step is to figure out what data you want to scrape and where to find it. For the time being, let's just agree that we want to scrape all products from all categories available on the [all collections page of the store](https://warehouse-theme-metal.myshopify.com/collections) and for each product we want to get its: + +- URL +- Manufacturer +- SKU +- Title +- Current price +- Stock available + +You will notice that some information is available directly on the list page, but for details such as "SKU" we'll also need to open the product's detail page. + +![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') + +### The start URL(s) + +This is where you start your crawl. It's convenient to start as close to the data as possible. For example, it wouldn't make much sense to start at https://warehouse-theme-metal.myshopify.com and look for a `collections` link there, when we already know that everything we want to extract can be found at the https://warehouse-theme-metal.myshopify.com/collections page. + +## Exploring the page + +Let's take a look at the https://warehouse-theme-metal.myshopify.com/collections page more carefully. There are some **categories** on the page, and each category has a list of **items**. On some category pages, at the bottom you will notice there are links to the next pages of results. This is usually called **the pagination**. + +### Categories and sorting + +When you click the categories, you'll see that they load a page of products filtered by that category. By going through a few categories and observing the behavior, we can also observe that we can sort by different conditions (such as `Best selling`, or `Price, low to high`), but for this example, we will not be looking into those. 
+ +:::caution Limited pagination + +Be careful, because on some websites, like [amazon.com](https://amazon.com), this is not true and the sum of products in categories is actually larger than what's available without filters. Learn more in our [tutorial on scraping websites with limited pagination](https://docs.apify.com/tutorials/scrape-paginated-sites). + +::: + +### Pagination + +The pagination of the demo Warehouse Store is simple enough. When switching between pages, you will see that the URL changes to: + +```text +https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2 +``` + +Try clicking on the link to page 4. You'll see that the pagination links update and show more pages. But can you trust that this will include all pages and won't stop at some point? + +:::caution Test your assumptions + +Similarly to the issue with filters explained above, the existence of pagination does not guarantee that you can simply paginate through all the results. Always test your assumptions about pagination. Otherwise, you might miss a chunk of results, and not even know about it. + +::: + +At the time of writing the `Headphones` collection results counter showed 75 results - products. Quick count of products on one page of results makes 24. 6 rows times 4 products. This means that there are 4 pages of results. + +If you're not convinced, you can visit a page somewhere in the middle, like `https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2` and see how the pagination looks there. + +## The crawling strategy + +Now that you know where to start and how to find all the collection details, let's look at the crawling process. + +1. Visit the store page containing the list of categories (our start URL). +2. Enqueue all links to all categories. +3. Enqueue all product pages from the current page. +4. Enqueue links to next pages of results. +5. Open the next page in queue. + - When it's a results list page, go to 2. 
+ - When it's a product page, scrape the data. +6. Repeat until all results pages and all products have been processed. + +`PlaywrightCrawler` will make sure to visit the pages for you, if you provide the correct requests, and you already know how to enqueue pages, so this should be fairly easy. Nevertheless, there are few more tricks that we'd like to showcase. + +## Sanity check + +Let's check that everything is set up correctly before writing the scraping logic itself. You might realize that something in your previous analysis doesn't quite add up, or the website might not behave exactly as you expected. + +The example below creates a new crawler that visits the start URL and prints the text content of all the categories on that page. When you run the code, you will see the _very badly formatted_ content of the individual category card. + +<RunnableCodeBlock className="language-python" language="python"> + {SanityCheckExample} +</RunnableCodeBlock> + +If you're wondering how to get that `.collection-block-item` selector. We'll explain it in the next chapter on DevTools. + +## DevTools - the scraper's toolbox + +:::info DevTool choice + +We'll use Chrome DevTools here, since it's the most common browser, but feel free to use any other, they're all very similar. + +::: + +Let's open DevTools by going to https://warehouse-theme-metal.myshopify.com/collections in Chrome and then right-clicking anywhere in the page and selecting **Inspect**, or by pressing **F12** or whatever your system prefers. With DevTools, you can inspect or manipulate any aspect of the currently open web page. You can learn more about DevTools in their [official documentation](https://developer.chrome.com/docs/devtools/). + +## Selecting elements + +In the DevTools, choose the **Select an element** tool and try hovering over one of the Actor cards. 
+ +![select an element](/img/getting-started/select-an-element.jpg 'Finding the select an element tool.') + +You'll see that you can select different elements inside the card. Instead, select the whole card, not just some of its contents, such as its title or description. + +![selected element](/img/getting-started/selected-element.jpg 'Selecting an element by hovering over it.') + +Selecting an element will highlight it in the DevTools HTML inspector. When carefully look at the elements, you'll see that there are some **classes** attached to the different HTML elements. Those are called **CSS classes**, and we can make a use of them in scraping. + +Conversely, by hovering over elements in the HTML inspector, you will see them highlight on the page. Inspect the page's structure around the collection card. You'll see that all the card's data is displayed in an `<a>` element with a `class` attribute that includes **collection-block-item**. It should now make sense how we got that `.collection-block-item` selector. It's just a way to find all elements that are annotated with the `collection-block-item`. + +It's always a good idea to double-check that you're not getting any unwanted elements with this class. To do that, go into the **Console** tab of DevTools and run: + +```ts +document.querySelectorAll('.collection-block-item'); +``` + +You will see that only the 31 collection cards will be returned, and nothing else. + +:::tip Learn more about CSS selectors and DevTools + +CSS selectors and DevTools are quite a big topic. If you want to learn more, visit the [Web scraping for beginners course](https://developers.apify.com/academy/web-scraping-for-beginners) in the Apify Academy. **It's free and open-source** โค๏ธ. + +::: + +## Next steps + +Next, you will crawl the whole store, including all the listing pages and all the product detail pages. 
diff --git a/website/versioned_docs/version-1.6/introduction/05_crawling.mdx b/website/versioned_docs/version-1.6/introduction/05_crawling.mdx new file mode 100644 index 0000000000..7c68662766 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/05_crawling.mdx @@ -0,0 +1,50 @@ +--- +id: crawling +title: Crawling +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import CrawlingListingExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_listing.py'; +import CrawlingDetailExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_detail.py'; + +To crawl the whole [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) and find all the data, you first need to visit all the pages with products - going through all categories available and also all the product detail pages. + +## Crawling the listing pages + +In previous lessons, you used the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function like this: + +```python +await enqueue_links() +``` + +While useful in that scenario, you need something different now. Instead of finding all the `<a href="..">` elements with links to the same hostname, you need to find only the specific ones that will take your crawler to the next page of results. Otherwise, the crawler will visit a lot of other pages that you're not interested in. Using the power of DevTools and yet another <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> parameter, this becomes fairly easy. + +<RunnableCodeBlock className="language-python" language="python"> + {CrawlingListingExample} +</RunnableCodeBlock> + +The code should look pretty familiar to you. It's a very simple request handler where we log the currently processed URL to the console and enqueue more links. But there are also a few new, interesting additions. Let's break it down. 
+ +### The `selector` parameter of `enqueue_links` + +When you previously used <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, you were not providing any `selector` parameter, and it was fine, because you wanted to use the default value, which is `a` - finds all `<a>` elements. But now, you need to be more specific. There are multiple `<a>` links on the `Categories` page, and you're only interested in those that will take your crawler to the available list of results. Using the DevTools, you'll find that you can select the links you need using the `.collection-block-item` selector, which selects all the elements that have the `class=collection-block-item` attribute. + +### The `label` of `enqueue_links` + +You will see `label` used often throughout Crawlee, as it's a convenient way of labelling a <ApiLink to="class/Request">`Request`</ApiLink> instance for quick identification later. You can access it with `request.label` and it's a `string`. You can name your requests any way you want. Here, we used the label `CATEGORY` to note that we're enqueueing pages that represent a category of products. The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function will add this label to all requests before enqueueing them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Why this is useful will become obvious in a minute. + +## Crawling the detail pages + +In a similar fashion, you need to collect all the URLs to the product detail pages, because only from there you can scrape all the data you need. The following code only repeats the concepts you already know for another set of links. + +<RunnableCodeBlock className="language-python" language="python"> + {CrawlingDetailExample} +</RunnableCodeBlock> + +The crawling code is now complete. When you run the code, you'll see the crawler visit all the listing URLs and all the detail URLs. 
+ +## Next steps + +This concludes the Crawling lesson, because you have taught the crawler to visit all the pages it needs. Let's continue with scraping data. diff --git a/website/versioned_docs/version-1.6/introduction/06_scraping.mdx b/website/versioned_docs/version-1.6/introduction/06_scraping.mdx new file mode 100644 index 0000000000..51c86e5835 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/06_scraping.mdx @@ -0,0 +1,155 @@ +--- +id: scraping +title: Scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ScrapingExample from '!!raw-loader!roa-loader!./code_examples/06_scraping.py'; + +In the [Real-world project](./real-world-project#choosing-the-data-you-need) chapter, you've created a list of the information you wanted to collect about the products in the example Warehouse store. Let's review that and figure out ways to access the data. + +- URL +- Manufacturer +- SKU +- Title +- Current price +- Stock available + +![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') + +## Scraping the URL and manufacturer + +Some information is lying right there in front of us without even having to touch the product detail pages. The `URL` we already have - the `context.request.url`. And by looking at it carefully, we realize that we can also extract the manufacturer from the URL (as all product urls start with `/products/<manufacturer>`). We can just split the `string` and be on our way then! + +:::info url vs loaded url + +You can use `request.loaded_url` as well. Remember the difference: `request.url` is what you enqueue, `request.loaded_url` is what gets processed (after possible redirects). + +::: + +By splitting the `request.url`, we can extract the manufacturer name directly from the URL. 
This is done by first splitting the URL to get the product identifier and then splitting that identifier to get the manufacturer name. + +```python +# context.request.url: +# https://warehouse-theme-metal.myshopify.com/products/sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 + +# Split the URL and get the last part. +url_part = context.request.url.split('/').pop() +# url_part: sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 + +# Split the last part by '-' and get the first element. +manufacturer = url_part.split('-')[0] +# manufacturer: 'sennheiser' +``` + +:::tip Storing information + +It's a matter of preference, whether to store this information separately in the resulting dataset, or not. Whoever uses the dataset can easily parse the `manufacturer` from the `URL`, so should you duplicate the data unnecessarily? Our opinion is that unless the increased data consumption would be too large to bear, it's better to make the dataset as rich as possible. For example, someone might want to filter by `manufacturer`. + +::: + +:::caution Adapt and extract + +One thing you may notice is that the `manufacturer` might have a `-` in its name. If that's the case, your best bet is extracting it from the details page instead, but it's not mandatory. At the end of the day, you should always adjust and pick the best solution for your use case, and website you are crawling. + +::: + +Now it's time to add more data to the results. Let's open one of the product detail pages, for example the [Sony XBR-950G](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv) page and use our DevTools-Fu ๐Ÿฅ‹ to figure out how to get the title of the product. + +## Scraping title + +To scrape the product title from a webpage, you need to identify its location in the HTML structure. 
By using the element selector tool in your browser's DevTools, you can see that the title is within an `<h1>` tag, which is a common practice for important headers. This `<h1>` tag is enclosed in a `<div>` with the class product-meta. We can leverage this structure to create a combined selector `.product-meta h1`. This selector targets any `<h1>` element that is a child of an element with the class `product-meta`. + +![product title](/img/getting-started/title.jpg 'Finding product title in DevTools.') + +:::tip Verifying selectors with DevTools + +Remember that you can press CTRL+F (or CMD+F on Mac) in the **Elements** tab of DevTools to open the search bar where you can quickly search for elements using their selectors. Always verify your scraping process and assumptions using the DevTools. It's faster than changing the crawler code all the time. + +::: + +To get the title, you need to locate it using Playwright with the `.product-meta h1` selector. This selector specifically targets the `<h1>` element you need. If multiple elements match, it will throw an error, which is beneficial as it prevents returning incorrect data silently. Ensuring the accuracy of your selectors is crucial for reliable data extraction. + +```python +title = await context.page.locator('.product-meta h1').text_content() +``` + +## Scraping SKU + +Using the DevTools, you can find that the product SKU is inside a `<span>` tag with the class `product-meta__sku-number`. Since there is no other `<span>` with that class on the page, you can safely use this selector to extract the SKU. + +![product sku selector](/img/getting-started/sku.jpg 'Finding product SKU in DevTools.') + +```python +# Find the SKU element using the selector and get its text content. +sku = await context.page.locator('span.product-meta__sku-number').text_content() +``` + +## Scraping current price + +Using DevTools, you can find that the current price is within a `<span>` element tagged with the `price` class. 
However, it is nested alongside another `<span>` element with the `visually-hidden` class. To avoid extracting the wrong text, you can filter the elements to get the correct one using the `has_text` helper. + +![product current price selector](/img/getting-started/current-price.jpg 'Finding product current price in DevTools.') + +```python +# Locate the price element and filter out the visually hidden elements. +price_element = context.page.locator('span.price', has_text='$').first + +# Extract the text content of the price element. +current_price_string = await price_element.text_content() or '' +# current_price_string: 'Sale price$1,398.00' + +# Split the string by the '$' sign to get the numeric part. +raw_price = current_price_string.split('$')[1] +# raw_price: '1,398.00' + +# Convert the raw price string to a float after removing commas. +price = float(raw_price.replace(',', '')) +# price: 1398.00 +``` + +It might look a little complex at first glance, but let's walk through what you did. First, you locate the correct part of the `price` span by filtering for elements containing the `$` sign. This ensures that you get the actual price element. Once you have the right element, you extract its text content, which gives you a string similar to `Sale price$1,398.00`. To get the numeric value, you split this string by the `$` sign. Next, you remove any commas from the resulting numeric string and convert it to a float, allowing you to work with the price as a number. This process ensures that you accurately extract and convert the current price from the product page. + +## Scraping stock availability + +The final step is to scrape the stock availability information. There is a `<span>` with the class `product-form__inventory`, which contains the text `In stock` if the product is available. You can use the `has_text` helper to filter out the correct element. + +```python +# Locate the element that contains the text 'In stock' and filter out other elements. 
+in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', +).first + +# Check if the element exists by counting the matching elements. +in_stock = await in_stock_element.count() > 0 +``` + +For this, all that matters is whether the element exists or not. You can use the `count()` method to check if any elements match the selector. If there are, it means the product is in stock. + +## Trying it out + +You have everything that is needed, so grab your newly created scraping logic, dump it into your original request handler and see the magic happen! + +<RunnableCodeBlock className="language-python" language="python"> + {ScrapingExample} +</RunnableCodeBlock> + +When you run the crawler, you will see the crawled URLs and their scraped data printed to the console. The output will look something like this: + +```json +{ + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-str-za810es-7-2-channel-hi-res-wi-fi-network-av-receiver", + "manufacturer": "sony", + "title": "Sony STR-ZA810ES 7.2-Ch Hi-Res Wi-Fi Network A/V Receiver", + "sku": "SON-692802-STR-DE", + "price": 698, + "in_stock": true +} +``` + +## Next steps + +Next, you'll see how to save the data you scraped to the disk for further processing. 
diff --git a/website/versioned_docs/version-1.6/introduction/07_saving_data.mdx b/website/versioned_docs/version-1.6/introduction/07_saving_data.mdx new file mode 100644 index 0000000000..adddd93af9 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/07_saving_data.mdx @@ -0,0 +1,126 @@ +--- +id: saving-data +title: Saving data +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import FirstCodeExample from '!!raw-loader!./code_examples/07_first_code.py'; + +import FinalCodeExample from '!!raw-loader!roa-loader!./code_examples/07_final_code.py'; + +A data extraction job would not be complete without saving the data for later use and processing. You've come to the final and most difficult part of this tutorial so make sure to pay attention very carefully! + +## Save data to the dataset + +Crawlee provides a <ApiLink to="class/Dataset">`Dataset`</ApiLink> class, which acts as an abstraction over tabular storage, making it useful for storing scraping results. To get started: + +- Add the necessary imports: Include the <ApiLink to="class/Dataset">`Dataset`</ApiLink> and any required crawler classes at the top of your file. +- Create a Dataset instance: Use the asynchronous <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> constructor to initialize the dataset instance within your crawler's setup. + +Here's an example: + +<CodeBlock language="python"> + {FirstCodeExample} +</CodeBlock> + +Finally, instead of logging the extracted data to stdout, we can export them to the dataset: + +```python +# ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # ... + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await dataset.push_data(data) + + # ... 
+``` + +### Using a context helper + +Instead of importing a new class and manually creating an instance of the dataset, you can use the context helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink>. Remove the dataset import and instantiation, and replace `dataset.push_data` with the following: + +```python +# ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # ... + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await context.push_data(data) + + # ... +``` + +### Final code + +And that's it. Unlike earlier, we are being serious now. That's it, you're done. The final code looks like this: + +<RunnableCodeBlock className="language-python" language="python"> + {FinalCodeExample} +</RunnableCodeBlock> + +## What `push_data` does? + +A helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> saves data to the default dataset. You can provide additional arguments there like `id` or `name` to open a different dataset. Dataset is a storage designed to hold data in a format similar to a table. Each time you call <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> or direct <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> a new row in the table is created, with the property names serving as column titles. In the default configuration, the rows are represented as JSON files saved on your file system, but other backend storage systems can be plugged into Crawlee as well. More on that later. + +:::info Automatic dataset initialization + +Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. 
For more details see the <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> function. + +::: + +{/* TODO: mention result storage guide once it's done + +:::info Automatic dataset initialization + +Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the [Result storage guide](../guides/result-storage#dataset) and the `Dataset.open()` function. + +::: +*/} + +## Finding saved data + +Unless you changed the configuration that Crawlee uses locally, which would suggest that you knew what you were doing, and you didn't need this tutorial anyway, you'll find your data in the storage directory that Crawlee creates in the working directory of the running script: + +```text +{PROJECT_FOLDER}/storage/datasets/default/ +``` + +The above folder will hold all your saved data in numbered files, as they were pushed into the dataset. Each file represents one invocation of <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> or one table row. + +{/* TODO: add mention of "Result storage guide" once it's ready: + +:::tip Single file data storage options + +If you would like to store your data in a single big file, instead of many small ones, see the [Result storage guide](../guides/result-storage#key-value-store) for Key-value stores. + +::: + +*/} + +## Next steps + +Next, you'll see some improvements that you can add to your crawler code that will make it more readable and maintainable in the long run. 
diff --git a/website/versioned_docs/version-1.6/introduction/08_refactoring.mdx b/website/versioned_docs/version-1.6/introduction/08_refactoring.mdx new file mode 100644 index 0000000000..a194a9e839 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/08_refactoring.mdx @@ -0,0 +1,72 @@ +--- +id: refactoring +title: Refactoring +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import MainExample from '!!raw-loader!./code_examples/08_main.py'; +import RoutesExample from '!!raw-loader!./code_examples/08_routes.py'; + +It may seem that the data is extracted and the crawler is done, but honestly, this is just the beginning. For the sake of brevity, we've completely omitted error handling, proxies, logging, architecture, tests, documentation and other stuff that a reliable software should have. The good thing is, error handling is mostly done by Crawlee itself, so no worries on that front, unless you need some custom magic. + +:::info Navigating automatic bot-protextion avoidance + +You might be wondering about the **anti-blocking, bot-protection avoiding stealthy features** and why we haven't highlighted them yet. The reason is straightforward: these features are **automatically used** within the default configuration, providing a smooth start without manual adjustments. + +::: + +{/* TODO: add this to the info once the relevant guide is ready + +However, the default configuration, while powerful, may not cover every scenario. + +If you want to learn more, browse the [Avoid getting blocked](../guides/avoid-blocking), [Proxy management](../guides/proxy-management) and [Session management](../guides/session-management) guides. +*/} + +To promote good coding practices, let's look at how you can use a <ApiLink to="class/Router">`Router`</ApiLink> class to better structure your crawler code. 
+ +## Request routing + +In the following code, we've made several changes: + +- Split the code into multiple files. +- Added custom instance of <ApiLink to="class/Router">`Router`</ApiLink> to make our routing cleaner, without if clauses. +- Moved route definitions to a separate `routes.py` file. +- Simplified the `main.py` file to focus on the general structure of the crawler. + +### Routes file + +First, let's define our routes in a separate file: + +<CodeBlock className="language-python" title="src/routes.py"> + {RoutesExample} +</CodeBlock> + +### Main file + +Next, our main file becomes much simpler and cleaner: + +<CodeBlock className="language-python" title="src/main.py"> + {MainExample} +</CodeBlock> + +By structuring your code this way, you achieve better separation of concerns, making the code easier to read, manage and extend. The <ApiLink to="class/Router">`Router`</ApiLink> class keeps your routing logic clean and modular, replacing if clauses with function decorators. + +## Summary + +Refactoring your crawler code with these practices enhances readability, maintainability, and scalability. + +### Splitting your code into multiple files + +There's no reason not to split your code into multiple files and keep your logic separate. Less code in a single file means less complexity to handle at any time, which improves overall readability and maintainability. Consider further splitting the routes into separate files for even better organization. + +### Using a router to structure your crawling + +Initially, using a simple `if` / `else` statement for selecting different logic based on the crawled pages might appear more readable. However, this approach can become cumbersome with more than two types of pages, especially when the logic for each page extends over dozens or even hundreds of lines of code. + +It's good practice in any programming language to split your logic into bite-sized chunks that are easy to read and reason about. 
Scrolling through a thousand line long `request_handler()` where everything interacts with everything and variables can be used everywhere is not a beautiful thing to do and a pain to debug. That's why we prefer the separation of routes into their own files. + +## Next steps + +In the next and final step, you'll see how to deploy your Crawlee project to the cloud. If you used the CLI to bootstrap your project, you already have a `Dockerfile` ready, and the next section will show you how to deploy it to the [Apify platform](../deployment/apify-platform) with ease. diff --git a/website/versioned_docs/version-1.6/introduction/09_running_in_cloud.mdx b/website/versioned_docs/version-1.6/introduction/09_running_in_cloud.mdx new file mode 100644 index 0000000000..db8273f94f --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/09_running_in_cloud.mdx @@ -0,0 +1,101 @@ +--- +id: deployment +title: Running your crawler in the Cloud +sidebar_label: Running in the Cloud +description: Deploying Crawlee-python projects to the Apify platform +--- + +import CodeBlock from '@theme/CodeBlock'; +import MainExample from '!!raw-loader!./code_examples/09_apify_sdk.py'; + +## Apify platform + +Crawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify platform**](https://console.apify.com) will give you the best experience. + +{/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/} + +With a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infra, proxies, and storages are ready to go. 
[Learn more about Actors](https://apify.com/actors). + +{/*:::info Choosing between Crawlee CLI and Apify CLI for project setup + +We started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify platform right ahead. + +:::*/} + +## Dependencies + +Before we get started, you'll need to install two new dependencies: + +- [**Apify SDK**](https://pypi.org/project/apify/), a toolkit for working with the Apify platform. This will allow us to wire the storages (e.g. [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. The Apify SDK, like Crawlee itself, is available as a PyPI package and can be installed with any Python package manager. To install it using [pip](https://pip.pypa.io/), run: + + ```sh + pip install apify + ``` + +- [**Apify CLI**](https://docs.apify.com/cli/), a command-line tool that will help us with authentication and deployment. It is a [Node.js](https://nodejs.org/) package, and can be installed using any Node.js package manager. In this guide, we will use [npm](https://npmjs.com/). We will install it globally, so you can use it across all your Crawlee and Apify projects. To install it using npm, run: + + ```sh + npm install -g apify-cli + ``` + +## Logging in to the Apify platform + +The next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). 
You will need your personal access token, which you can find at https://console.apify.com/account#/integrations. + +```sh +apify login +``` + +## Adjusting the code + +Now that you have your account set up, you will need to adjust the code a tiny bit. We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us to wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify platform counterparts - otherwise Crawlee would keep things only in memory. + +Open your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. Your code should look like this: + +<CodeBlock className="language-python" title="src/main.py"> + {MainExample} +</CodeBlock> + +The context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up a few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown. + +:::info Understanding `async with Actor` behavior with environment variables + +The [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on the environment variables, namely based on the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify platform. This means that your project will remain working the same locally, but will use the Apify API when deployed to the Apify platform. + +::: + +## Initializing the project + +You will also need to initialize the project for Apify, to do that, use the Apify CLI again: + +```sh +apify init +``` + +The CLI will check the project structure and guide you through the setup process. If prompted, follow the instructions and answer the questions to configure the project correctly. For more information follow the [Apify CLI documentation](https://docs.apify.com/cli/docs).
+ +This will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see all the different things you can set up there. + +## Ship it! + +And that's all, your project is now ready to be published on the Apify platform. You can use the Apify CLI once more to do that: + +```sh +apify push +``` + +This command will create an archive from your project, upload it to the Apify platform and initiate a Docker build. Once finished, you will get a link to your new Actor on the platform. + +## Learning more about web scraping + +:::tip Explore Apify Academy Resources + +If you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic. From beginner to advanced. And the best thing: **It's free and open source** ❤️ + +{/*If you want to do one more project, check out our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/} + +::: + +## Thank you! 🎉 + +That's it! Thanks for reading the whole introduction and if there's anything wrong, please 🙏 let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 👋 diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/02_bs.py b/website/versioned_docs/version-1.6/introduction/code_examples/02_bs.py new file mode 100644 index 0000000000..6e5ee30069 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/02_bs.py @@ -0,0 +1,30 @@ +import asyncio + +# Add import of crawler and crawling context.
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storages import RequestQueue + + +async def main() -> None: + # First you create the request queue instance. + rq = await RequestQueue.open() + + # And then you add one or more requests to it. + await rq.add_request('https://crawlee.dev') + + crawler = BeautifulSoupCrawler(request_manager=rq) + + # Define a request handler and attach it to the crawler using the decorator. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + # Extract <title> text with BeautifulSoup. + # See BeautifulSoup documentation for API docs. + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/02_bs_better.py b/website/versioned_docs/version-1.6/introduction/code_examples/02_bs_better.py new file mode 100644 index 0000000000..1a985722b6 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/02_bs_better.py @@ -0,0 +1,21 @@ +import asyncio + +# You don't need to import RequestQueue anymore. +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + # Start the crawler with the provided URLs. 
+ await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/02_request_queue.py b/website/versioned_docs/version-1.6/introduction/code_examples/02_request_queue.py new file mode 100644 index 0000000000..e6cc5eb8c3 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/02_request_queue.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee.storages import RequestQueue + + +async def main() -> None: + # First you create the request queue instance. + rq = await RequestQueue.open() + + # And then you add one or more requests to it. + await rq.add_request('https://crawlee.dev') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/03_enqueue_strategy.py b/website/versioned_docs/version-1.6/introduction/code_examples/03_enqueue_strategy.py new file mode 100644 index 0000000000..6aff8a1fba --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/03_enqueue_strategy.py @@ -0,0 +1,25 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # See the `EnqueueStrategy` type alias for more strategy options. 
+ # highlight-next-line + await context.enqueue_links( + # highlight-next-line + strategy='same-domain', + # highlight-next-line + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/03_finding_new_links.py b/website/versioned_docs/version-1.6/introduction/code_examples/03_finding_new_links.py new file mode 100644 index 0000000000..e25af30c13 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/03_finding_new_links.py @@ -0,0 +1,24 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Let's limit our crawls to make our tests shorter and safer. + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + context.log.info(f'The title of {url} is: {title}.') + + # The enqueue_links function is available as one of the fields of the context. + # It is also context aware, so it does not require any parameters. 
+ await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/03_globs.py b/website/versioned_docs/version-1.6/introduction/code_examples/03_globs.py new file mode 100644 index 0000000000..c2f2627d95 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/03_globs.py @@ -0,0 +1,29 @@ +import asyncio + +from crawlee import Glob +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # Enqueue links that match the 'include' glob pattern and + # do not match the 'exclude' glob pattern. + # highlight-next-line + await context.enqueue_links( + # highlight-next-line + include=[Glob('https://someplace.com/**/cats')], + # highlight-next-line + exclude=[Glob('https://**/archive/**')], + # highlight-next-line + ) + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/03_original_code.py b/website/versioned_docs/version-1.6/introduction/code_examples/03_original_code.py new file mode 100644 index 0000000000..976e84d562 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/03_original_code.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + title = context.soup.title.string if context.soup.title else '' + 
context.log.info(f'The title of {url} is: {title}.') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/03_transform_request.py b/website/versioned_docs/version-1.6/introduction/code_examples/03_transform_request.py new file mode 100644 index 0000000000..5f11a1cafa --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/03_transform_request.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import asyncio + +from crawlee import HttpHeaders, RequestOptions, RequestTransformAction +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +def transform_request( + request_options: RequestOptions, +) -> RequestOptions | RequestTransformAction: + # Skip requests to PDF files + if request_options['url'].endswith('.pdf'): + return 'skip' + + if '/docs' in request_options['url']: + # Add custom headers to requests to specific URLs + request_options['headers'] = HttpHeaders({'Custom-Header': 'value'}) + + elif '/blog' in request_options['url']: + # Add label for certain URLs + request_options['label'] = 'BLOG' + + else: + # Signal that the request should proceed without any transformation + return 'unchanged' + + return request_options + + +async def main() -> None: + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}.') + + # Transform request before enqueueing + await context.enqueue_links(transform_request_function=transform_request) + + @crawler.router.handler('BLOG') + async def blog_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Blog Processing {context.request.url}.') + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git 
a/website/versioned_docs/version-1.6/introduction/code_examples/04_sanity_check.py b/website/versioned_docs/version-1.6/introduction/code_examples/04_sanity_check.py new file mode 100644 index 0000000000..5bfbccd27e --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/04_sanity_check.py @@ -0,0 +1,32 @@ +import asyncio + +# Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript. +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + # Wait for the collection cards to render on the page. This ensures that + # the elements we want to interact with are present in the DOM. + await context.page.wait_for_selector('.collection-block-item') + + # Execute a function within the browser context to target the collection + # card elements and extract their text content, trimming any leading or + # trailing whitespace. + category_texts = await context.page.eval_on_selector_all( + '.collection-block-item', + '(els) => els.map(el => el.textContent.trim())', + ) + + # Log the extracted texts. 
+ for i, text in enumerate(category_texts): + context.log.info(f'CATEGORY_{i + 1}: {text}') + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_detail.py b/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_detail.py new file mode 100644 index 0000000000..a6845f23b0 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_detail.py @@ -0,0 +1,57 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We're not processing detail pages yet, so we just pass. + if context.request.label == 'DETAIL': + pass + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. 
+ else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. + await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_listing.py b/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_listing.py new file mode 100644 index 0000000000..c9c47f57d8 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/05_crawling_listing.py @@ -0,0 +1,28 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Wait for the category cards to render on the page. This ensures that + # the elements we want to interact with are present in the DOM. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements that match the specified selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/06_scraping.py b/website/versioned_docs/version-1.6/introduction/code_examples/06_scraping.py new file mode 100644 index 0000000000..f1faf1c521 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/06_scraping.py @@ -0,0 +1,97 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We are now on a detail page. We can scrape the product data from it. + if context.request.label == 'DETAIL': + # Split the URL and get the last part to extract the manufacturer. + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + # Extract the title using the combined selector. + title = await context.page.locator('.product-meta h1').text_content() + + # Extract the SKU using its selector. + sku = await context.page.locator( + 'span.product-meta__sku-number' + ).text_content() + + # Locate the price element that contains the '$' sign and filter out + # the visually hidden elements. + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + # Locate the element that contains the text 'In stock' + # and filter out other elements.
+ in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + # Put it all together in a dictionary. + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Print the extracted data. + context.log.info(data) + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. + else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/07_final_code.py b/website/versioned_docs/version-1.6/introduction/code_examples/07_final_code.py new file mode 100644 index 0000000000..a1a89167b5 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/07_final_code.py @@ -0,0 +1,97 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # We are now on a detail page. We can scrape the product data from it. + if context.request.label == 'DETAIL': + # Split the URL and get the last part to extract the manufacturer. + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + # Extract the title using the combined selector. + title = await context.page.locator('.product-meta h1').text_content() + + # Extract the SKU using its selector. + sku = await context.page.locator( + 'span.product-meta__sku-number' + ).text_content() + + # Locate the price element that contains the '$' sign and filter out + # the visually hidden elements. + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + # Locate the element that contains the text 'In stock' and filter out + # other elements.
+ in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + # Put it all together in a dictionary. + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + # Push the data to the dataset. + await context.push_data(data) + + # We are now on a category page. We can use this to paginate through and + # enqueue all products, as well as any subsequent pages we find. + elif context.request.label == 'CATEGORY': + # Wait for the product items to render. + await context.page.wait_for_selector('.product-item > a') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label DETAIL. + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + # Find the "Next" button to paginate through the category pages. + next_button = await context.page.query_selector('a.pagination__next') + + # If a "Next" button is found, enqueue the next page of results. + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + # This indicates we're on the start page with no specific label. + # On the start page, we want to enqueue all the category pages. + else: + # Wait for the collection cards to render. + await context.page.wait_for_selector('.collection-block-item') + + # Enqueue links found within elements matching the provided selector. + # These links will be added to the crawling queue with the label CATEGORY. 
+ await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/07_first_code.py b/website/versioned_docs/version-1.6/introduction/code_examples/07_first_code.py new file mode 100644 index 0000000000..89de967684 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/07_first_code.py @@ -0,0 +1,22 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.storages import Dataset + +# ... + + +async def main() -> None: + crawler = PlaywrightCrawler() + dataset = await Dataset.open() + + # ... + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + ... + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/08_main.py b/website/versioned_docs/version-1.6/introduction/code_examples/08_main.py new file mode 100644 index 0000000000..09f33e3376 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/08_main.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler + +from .routes import router + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + # Provide our router instance to the crawler. 
+ request_handler=router, + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/08_routes.py b/website/versioned_docs/version-1.6/introduction/code_examples/08_routes.py new file mode 100644 index 0000000000..58031821eb --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/08_routes.py @@ -0,0 +1,72 @@ +from crawlee.crawlers import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: PlaywrightCrawlingContext) -> None: + # This is a fallback route which will handle the start URL. + context.log.info(f'default_handler is processing {context.request.url}') + + await context.page.wait_for_selector('.collection-block-item') + + await context.enqueue_links( + selector='.collection-block-item', + label='CATEGORY', + ) + + +@router.handler('CATEGORY') +async def category_handler(context: PlaywrightCrawlingContext) -> None: + # This replaces the context.request.label == CATEGORY branch of the if clause. + context.log.info(f'category_handler is processing {context.request.url}') + + await context.page.wait_for_selector('.product-item > a') + + await context.enqueue_links( + selector='.product-item > a', + label='DETAIL', + ) + + next_button = await context.page.query_selector('a.pagination__next') + + if next_button: + await context.enqueue_links( + selector='a.pagination__next', + label='CATEGORY', + ) + + +@router.handler('DETAIL') +async def detail_handler(context: PlaywrightCrawlingContext) -> None: + # This replaces the context.request.label == DETAIL branch of the if clause. 
+ context.log.info(f'detail_handler is processing {context.request.url}') + + url_part = context.request.url.split('/').pop() + manufacturer = url_part.split('-')[0] + + title = await context.page.locator('.product-meta h1').text_content() + + sku = await context.page.locator('span.product-meta__sku-number').text_content() + + price_element = context.page.locator('span.price', has_text='$').first + current_price_string = await price_element.text_content() or '' + raw_price = current_price_string.split('$')[1] + price = float(raw_price.replace(',', '')) + + in_stock_element = context.page.locator( + selector='span.product-form__inventory', + has_text='In stock', + ).first + in_stock = await in_stock_element.count() > 0 + + data = { + 'manufacturer': manufacturer, + 'title': title, + 'sku': sku, + 'price': price, + 'in_stock': in_stock, + } + + await context.push_data(data) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/09_apify_sdk.py b/website/versioned_docs/version-1.6/introduction/code_examples/09_apify_sdk.py new file mode 100644 index 0000000000..fd8ceaffe7 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/09_apify_sdk.py @@ -0,0 +1,25 @@ +import asyncio + +# highlight-next-line +from apify import Actor + +from crawlee.crawlers import PlaywrightCrawler + +from .routes import router + + +async def main() -> None: + # highlight-next-line + async with Actor: + crawler = PlaywrightCrawler( + # Let's limit our crawls to make our tests shorter and safer. + max_requests_per_crawl=10, + # Provide our router instance to the crawler. 
+ request_handler=router, + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/__init__.py b/website/versioned_docs/version-1.6/introduction/code_examples/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/website/versioned_docs/version-1.6/introduction/code_examples/routes.py b/website/versioned_docs/version-1.6/introduction/code_examples/routes.py new file mode 100644 index 0000000000..be20b37c81 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/code_examples/routes.py @@ -0,0 +1,4 @@ +from crawlee.crawlers import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() diff --git a/website/versioned_docs/version-1.6/introduction/index.mdx b/website/versioned_docs/version-1.6/introduction/index.mdx new file mode 100644 index 0000000000..af37ec02c4 --- /dev/null +++ b/website/versioned_docs/version-1.6/introduction/index.mdx @@ -0,0 +1,54 @@ +--- +id: introduction +title: Introduction +--- + +import ApiLink from '@site/src/components/ApiLink'; + +Crawlee covers your crawling and scraping end-to-end and helps you **build reliable scrapers. Fast.** + +Your crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. + +## What you will learn + +The goal of the introduction is to provide a step-by-step guide to the most important features of Crawlee. 
It will walk you through creating the simplest of crawlers that only prints text to console, all the way up to a full-featured scraper that collects links from a website and extracts data.
+
+## 🛠 Features
+
+Why is Crawlee the preferred choice for web scraping and crawling?
+
+### Why use Crawlee instead of just a random HTTP library with an HTML parser?
+
+- Unified interface for **HTTP & headless browser** crawling.
+- Automatic **parallel crawling** based on available system resources.
+- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).
+- Automatic **retries** on errors or when you are getting blocked.
+- Integrated **proxy rotation** and session management.
+- Configurable **request routing** - direct URLs to the appropriate handlers.
+- Persistent **queue for URLs** to crawl.
+- Pluggable **storage** of both tabular data and files.
+- Robust **error handling**.
+
+### Why use Crawlee rather than Scrapy?
+
+- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright).
+- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code.
+- Complete **type hint** coverage.
+- Based on standard **Asyncio**.
+
+{/* TODO:
+
+### 👾 HTTP crawling
+
+- ...
+*/}
+
+{/* TODO:
+### 💻 Real browser crawling
+
+- ...
+*/}
+
+## Next steps
+
+Next, you will install Crawlee and learn how to bootstrap projects with the prepared Crawlee templates.
diff --git a/website/versioned_docs/version-1.6/pyproject.toml b/website/versioned_docs/version-1.6/pyproject.toml
new file mode 100644
index 0000000000..44ad1b831e
--- /dev/null
+++ b/website/versioned_docs/version-1.6/pyproject.toml
@@ -0,0 +1,9 @@
+# Line length different from the rest of the code to make sure that the example codes visualised on the generated
+# documentation webpages are shown without vertical slider to make them more readable.
+ +[tool.ruff] +# Inherit all from project top configuration file. +extend = "../pyproject.toml" + +# Override just line length +line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need slider. diff --git a/website/versioned_docs/version-1.6/quick-start/code_examples/beautifulsoup_crawler_example.py b/website/versioned_docs/version-1.6/quick-start/code_examples/beautifulsoup_crawler_example.py new file mode 100644 index 0000000000..2db8874c4b --- /dev/null +++ b/website/versioned_docs/version-1.6/quick-start/code_examples/beautifulsoup_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # BeautifulSoupCrawler crawls the web using HTTP requests + # and parses HTML using the BeautifulSoup library. + crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/quick-start/code_examples/parsel_crawler_example.py b/website/versioned_docs/version-1.6/quick-start/code_examples/parsel_crawler_example.py new file mode 100644 index 0000000000..f8ed2a3e9c --- /dev/null +++ b/website/versioned_docs/version-1.6/quick-start/code_examples/parsel_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # ParselCrawler crawls the web using HTTP requests + # and parses HTML using the Parsel library. + crawler = ParselCrawler(max_requests_per_crawl=10) + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_example.py b/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_example.py new file mode 100644 index 0000000000..1bc30ae320 --- /dev/null +++ b/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_example.py @@ -0,0 +1,31 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +async def main() -> None: + # PlaywrightCrawler crawls the web using a headless browser + # controlled by the Playwright library. + crawler = PlaywrightCrawler() + + # Define a request handler to process each crawled page + # and attach it to the crawler using a decorator. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Extract relevant data from the page context. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + } + # Store the extracted data. + await context.push_data(data) + # Extract links from the current page and add them to the crawling queue. + await context.enqueue_links() + + # Add first URL to the queue and start the crawl. 
+ await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_headful_example.py b/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_headful_example.py new file mode 100644 index 0000000000..403c665e51 --- /dev/null +++ b/website/versioned_docs/version-1.6/quick-start/code_examples/playwright_crawler_headful_example.py @@ -0,0 +1,19 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Run with a visible browser window. + # highlight-next-line + headless=False, + # Switch to the Firefox browser. + browser_type='firefox', + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/website/versioned_docs/version-1.6/quick-start/index.mdx b/website/versioned_docs/version-1.6/quick-start/index.mdx new file mode 100644 index 0000000000..6ecd18b302 --- /dev/null +++ b/website/versioned_docs/version-1.6/quick-start/index.mdx @@ -0,0 +1,133 @@ +--- +id: quick-start +title: Quick start +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BeautifulsoupCrawlerExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_example.py'; +import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_example.py'; +import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_example.py'; + +import PlaywrightCrawlerHeadfulExample from '!!raw-loader!./code_examples/playwright_crawler_headful_example.py'; + +This short tutorial will help you start scraping with Crawlee in just a minute or two. 
For an in-depth understanding of how Crawlee works, check out the [Introduction](../introduction/index.mdx) section, which provides a comprehensive step-by-step guide to creating your first scraper. + +## Choose your crawler + +Crawlee offers the following main crawler classes: <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. All crawlers share the same interface, providing maximum flexibility when switching between them. + +:::caution Minimum Python version + +Crawlee requires Python 3.10 or higher. + +::: + +### BeautifulSoupCrawler + +The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> is a plain HTTP crawler that parses HTML using the well-known [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library. It crawls the web using an HTTP client that mimics a browser. This crawler is very fast and efficient but cannot handle JavaScript rendering. + +### ParselCrawler + +The <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> is similar to the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> but uses the [Parsel](https://pypi.org/project/parsel/) library for HTML parsing. Parsel is a lightweight library that provides a CSS selector-based API for extracting data from HTML documents. If you are familiar with the [Scrapy](https://scrapy.org/) framework, you will feel right at home with Parsel. As with the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> cannot handle JavaScript rendering. + +### PlaywrightCrawler + +The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> uses a headless browser controlled by the [Playwright](https://playwright.dev/) library. It can manage Chromium, Firefox, Webkit, and other browsers. 
Playwright is the successor to the [Puppeteer](https://pptr.dev/) library and is becoming the de facto standard in headless browser automation. If you need a headless browser, choose Playwright.
+
+## Installation
+
+Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.
+
+You can install Crawlee with all features or choose only the ones you need. For installing it using the [pip](https://pip.pypa.io/en/stable/) package manager, run the following command:
+
+```sh
+python -m pip install 'crawlee[all]'
+```
+
+Verify that Crawlee is successfully installed:
+
+```sh
+python -c 'import crawlee; print(crawlee.__version__)'
+```
+
+If you plan to use the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, you'll need to install Playwright dependencies, including the browser binaries. To do this, run the following command:
+
+```sh
+playwright install
+```
+
+For detailed installation instructions, see the [Setting up](../introduction/01_setting_up.mdx) documentation page.
+
+## Crawling
+
+Run the following example to perform a recursive crawl of the Crawlee website using the selected crawler.
+
+<Tabs groupId="quickStart">
+  <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default>
+    <RunnableCodeBlock className="language-python" language="python">
+      {BeautifulsoupCrawlerExample}
+    </RunnableCodeBlock>
+  </TabItem>
+  <TabItem value="ParselCrawler" label="ParselCrawler">
+    <RunnableCodeBlock className="language-python" language="python">
+      {ParselCrawlerExample}
+    </RunnableCodeBlock>
+  </TabItem>
+  <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
+    <RunnableCodeBlock className="language-python" language="python">
+      {PlaywrightCrawlerExample}
+    </RunnableCodeBlock>
+  </TabItem>
+</Tabs>
+
+When you run the example, you will see Crawlee automating the data extraction process in your terminal.
+
+{/* TODO: improve the logging and add here a sample */}
+
+## Running headful browser
+
+By default, browsers controlled by Playwright run in headless mode (without a visible window). However, you can configure the crawler to run in a headful mode, which is useful during the development phase to observe the browser's actions. You can also switch from the default Chromium browser to Firefox or WebKit.
+
+<CodeBlock language="python">
+  {PlaywrightCrawlerHeadfulExample}
+</CodeBlock>
+
+When you run the example code, you'll see an automated browser navigating through the Crawlee website.
+
+{/* TODO: add video example */}
+
+## Results
+
+By default, Crawlee stores data in the `./storage` directory within your current working directory. The results of your crawl will be saved as JSON files under `./storage/datasets/default/`.
+
+To view the results, you can use the `cat` command:
+
+```sh
+cat ./storage/datasets/default/000000001.json
+```
+
+The JSON file will contain data similar to the following:
+
+```json
+{
+  "url": "https://crawlee.dev/",
+  "title": "Crawlee · Build reliable crawlers. Fast. | Crawlee"
+}
+```
+
+:::tip
+
+If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path.
+
+:::
+
+## Examples and further reading
+
+For more examples showcasing various features of Crawlee, visit the [Examples](/docs/examples) section of the documentation. To get a deeper understanding of Crawlee and its components, read the step-by-step [Introduction](../introduction/index.mdx) guide.
+
+[//]: # (TODO: add related links once they are ready)
diff --git a/website/versioned_docs/version-1.6/upgrading/upgrading_to_v0x.md b/website/versioned_docs/version-1.6/upgrading/upgrading_to_v0x.md
new file mode 100644
index 0000000000..d769d67d4c
--- /dev/null
+++ b/website/versioned_docs/version-1.6/upgrading/upgrading_to_v0x.md
@@ -0,0 +1,170 @@
+---
+id: upgrading-to-v0x
+title: Upgrading to v0.x
+---
+
+This page summarizes the breaking changes between Crawlee for Python zero-based versions.
+
+## Upgrading to v0.6
+
+This section summarizes the breaking changes between v0.5.x and v0.6.0.
+
+### HttpCrawlerOptions
+
+- Removed `HttpCrawlerOptions` - which contained options from `BasicCrawlerOptions` and unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both of the unique options were added to `BasicCrawlerOptions` instead.
+
+### HttpClient
+
+- The signature of the `HttpClient` class has been updated. The constructor parameters `additional_http_error_status_codes` and `ignore_http_error_status_codes` have been removed and are now only available in `BasicCrawlerOptions`.
+- The method `_raise_for_error_status_code` has been removed from `HttpClient`. Its logic has been moved to the `BasicCrawler` class.
+
+### SessionCookies
+
+- Replaces the `dict` used for cookie storage in `Session.cookies` with a new `SessionCookies` class. `SessionCookies` uses `CookieJar`, which enables support for multiple domains.
+ +### PlaywrightCrawler and PlaywrightBrowserPlugin + +- `PlaywrightCrawler` now use a persistent browser context instead of the standard browser context. +- Added `user_data_dir` parameter for `PlaywrightCrawler` and `PlaywrightBrowserPlugin` to specify the directory for the persistent context. If not provided, a temporary directory will be created automatically. + +### Configuration + +The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`. + +### CLI dependencies + +CLI dependencies have been moved to optional dependencies. If you need the CLI, install `crawlee[cli]` + +### Abstract base classes + +We decided to move away from [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation) and remove all the `Base` prefixes from the abstract classes. It includes the following public classes: +- `BaseStorageClient` -> `StorageClient` +- `BaseBrowserController` -> `BrowserController` +- `BaseBrowserPlugin` -> `BrowserPlugin` + +### EnqueueStrategy + +The `EnqueueStrategy` has been changed from an enum to a string literal type. All its values and their meaning remain unchanged. + +## Upgrading to v0.5 + +This section summarizes the breaking changes between v0.4.x and v0.5.0. + +### Crawlers & CrawlingContexts + +- All crawler and crawling context classes have been consolidated into a single sub-package called `crawlers`. 
+- The affected classes include: `AbstractHttpCrawler`, `AbstractHttpParser`, `BasicCrawler`, `BasicCrawlerOptions`, `BasicCrawlingContext`, `BeautifulSoupCrawler`, `BeautifulSoupCrawlingContext`, `BeautifulSoupParserType`, `ContextPipeline`, `HttpCrawler`, `HttpCrawlerOptions`, `HttpCrawlingContext`, `HttpCrawlingResult`, `ParsedHttpCrawlingContext`, `ParselCrawler`, `ParselCrawlingContext`, `PlaywrightCrawler`, `PlaywrightCrawlingContext`, `PlaywrightPreNavCrawlingContext`. + +Example update: +```diff +- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext ++ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +``` + +### Storage clients + +- All storage client classes have been moved into a single sub-package called `storage_clients`. +- The affected classes include: `MemoryStorageClient`, `BaseStorageClient`. + +Example update: +```diff +- from crawlee.memory_storage_client import MemoryStorageClient ++ from crawlee.storage_clients import MemoryStorageClient +``` + +### CurlImpersonateHttpClient + +- The `CurlImpersonateHttpClient` changed its import location. + +Example update: +```diff +- from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient ++ from crawlee.http_clients import CurlImpersonateHttpClient +``` + +### BeautifulSoupParser + +- Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. Probably used only in type hints. Please replace previous usages of `BeautifulSoupParser` by `BeautifulSoupParserType`. +- `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`. + +### Service locator + +- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`. +- You can use it to set the configuration, event manager or storage client globally. Or you can pass them to your crawler instance directly and it will use the service locator under the hood. 
+
+### Statistics
+
+- The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one.
+- If you want to set your custom event manager, do it either via the service locator or pass it to the crawler.
+
+### Request
+
+- The properties `json_` and `order_no` were removed. They were there only for the internal purpose of the memory storage client, you should not need them.
+
+### Request storages and loaders
+
+- The `request_provider` parameter of `BasicCrawler.__init__` has been renamed to `request_manager`
+- The `BasicCrawler.get_request_provider` method has been renamed to `BasicCrawler.get_request_manager` and it does not accept the `id` and `name` arguments anymore
+  - If using a specific request queue is desired, pass it as the `request_manager` on `BasicCrawler` creation
+- The `RequestProvider` interface has been renamed to `RequestManager` and moved to the `crawlee.request_loaders` package
+- `RequestList` has been moved to the `crawlee.request_loaders` package
+- `RequestList` does not support `.drop()`, `.reclaim_request()`, `.add_request()` and `add_requests_batched()` anymore
+  - It implements the new `RequestLoader` interface instead of `RequestManager`
+  - `RequestManagerTandem` with a `RequestQueue` should be used to enable passing a `RequestList` (or any other `RequestLoader` implementation) as a `request_manager`, `await list.to_tandem()` can be used as a shortcut
+
+### PlaywrightCrawler
+
+- The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`.
+- The input arguments in `PlaywrightCrawler.__init__` have been renamed:
+  - `browser_options` is now `browser_launch_options`,
+  - `page_options` is now `browser_new_context_options`.
+- These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`.
+ +## Upgrading to v0.4 + +This section summarizes the breaking changes between v0.3.x and v0.4.0. + +### Request model + +- The `Request.query_params` field has been removed. Please add query parameters directly to the URL, which was possible before as well, and is now the only supported approach. +- The `Request.payload` and `Request.data` fields have been consolidated. Now, only `Request.payload` remains, and it should be used for all payload data in requests. + +### Extended unique key computation + +- The computation of `extended_unique_key` now includes HTTP headers. While this change impacts the behavior, the interface remains the same. + +## Upgrading to v0.3 + +This section summarizes the breaking changes between v0.2.x and v0.3.0. + +### Public and private interface declaration + +In previous versions, the majority of the package was fully public, including many elements intended for internal use only. With the release of v0.3, we have clearly defined the public and private interface of the package. As a result, some imports have been updated (see below). If you are importing something now designated as private, we recommend reconsidering its use or discussing your use case with us in the discussions/issues. 
+ +Here is a list of the updated public imports: + +```diff +- from crawlee.enqueue_strategy import EnqueueStrategy ++ from crawlee import EnqueueStrategy +``` + +```diff +- from crawlee.models import Request ++ from crawlee import Request +``` + +```diff +- from crawlee.basic_crawler import Router ++ from crawlee.router import Router +``` + +### Request queue + +There were internal changes that should not affect the intended usage: + +- The unused `BaseRequestQueueClient.list_requests()` method was removed +- `RequestQueue` internals were updated to match the "Request Queue V2" implementation in Crawlee for JS + +### Service container + +A new module, `crawlee.service_container`, was added to allow management of "global instances" - currently it contains `Configuration`, `EventManager` and `BaseStorageClient`. The module also replaces the `StorageClientManager` static class. It is likely that its interface will change in the future. If your use case requires working with it, please get in touch - we'll be glad to hear any feedback. diff --git a/website/versioned_docs/version-1.6/upgrading/upgrading_to_v1.md b/website/versioned_docs/version-1.6/upgrading/upgrading_to_v1.md new file mode 100644 index 0000000000..7824e48887 --- /dev/null +++ b/website/versioned_docs/version-1.6/upgrading/upgrading_to_v1.md @@ -0,0 +1,339 @@ +--- +id: upgrading-to-v1 +title: Upgrading to v1 +--- + +This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0. + +## Terminology change: "browser" in different contexts + +The word "browser" is now used distinctly in two contexts: + +- **Playwright context** - Refers to Playwright-supported browsers (`chromium`, `firefox`, `webkit`, `edge`). +- **Fingerprinting context** - Refers to browsers supported by fingerprint generation (`chrome`, `firefox`, `safari`, `edge`). 
+ +The type of `HeaderGeneratorOptions.browsers` has changed accordingly: + +**Before (v0.6):** + +```python +from crawlee.fingerprint_suite import HeaderGeneratorOptions + +HeaderGeneratorOptions(browsers=['chromium']) +HeaderGeneratorOptions(browsers=['webkit']) +``` + +**Now (v1.0):** + +```python +from crawlee.fingerprint_suite import HeaderGeneratorOptions + +HeaderGeneratorOptions(browsers=['chrome']) +HeaderGeneratorOptions(browsers=['safari']) +``` + +## New default HTTP client + +Crawlee v1.0 now uses `ImpitHttpClient` (based on [impit](https://apify.github.io/impit/) library) as the **default HTTP client**, replacing `HttpxHttpClient` (based on [httpx](https://www.python-httpx.org/) library). + +If you want to keep using `HttpxHttpClient`, install Crawlee with `httpx` extra, e.g. using pip: + +```bash +pip install 'crawlee[httpx]' +``` + +And then provide the HTTP client explicitly to the crawler: + +```python +from crawlee.crawlers import HttpCrawler +from crawlee.http_clients import HttpxHttpClient + +client = HttpxHttpClient() +crawler = HttpCrawler(http_client=client) +``` + +See the [HTTP clients guide](https://crawlee.dev/python/docs/guides/http-clients) for all options. + +## Changes in storages + +In Crawlee v1.0, the `Dataset`, `KeyValueStore`, and `RequestQueue` storage APIs have been updated for consistency and simplicity. Below is a detailed overview of what's new, what's changed, and what's been removed. + +See the [Storages guide](https://crawlee.dev/python/docs/guides/storages) for more details. + +### Dataset + +The `Dataset` API now includes several new methods, such as: + +- `get_metadata` - retrieves metadata information for the dataset. +- `purge` - completely clears the dataset, including all items (keeps the metadata only). +- `list_items` - returns the dataset's items in a list format. + +Some older methods have been removed or replaced: + +- `from_storage_object` constructor has been removed. 
You should now use the `open` method with either a `name` or `id` parameter. +- `get_info` method and the `storage_object` property have been replaced by the new `get_metadata` method. +- `set_metadata` method has been removed. +- `write_to_json` and `write_to_csv` methods have been removed; instead, use the `export_to` method for exporting data in different formats. + +### Key-value store + +The `KeyValueStore` API now includes several new methods, such as: + +- `get_metadata` - retrieves metadata information for the key-value store. +- `purge` - completely clears the key-value store, removing all keys and values (keeps the metadata only). +- `delete_value` - deletes a specific key and its associated value. +- `list_keys` - lists all keys in the key-value store. + +Some older methods have been removed or replaced: + +- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead. +- `get_info` and `storage_object` - replaced by the new `get_metadata` method. +- `set_metadata` method has been removed. + +### Request queue + +The `RequestQueue` API now includes several new methods, such as: + +- `get_metadata` - retrieves metadata information for the request queue. +- `purge` - completely clears the request queue, including all pending and processed requests (keeps the metadata only). +- `add_requests` - replaces the previous `add_requests_batched` method, offering the same functionality under a simpler name. + +Some older methods have been removed or replaced: + +- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead. +- `get_info` and `storage_object` - replaced by the new `get_metadata` method. +- `get_request` has argument `unique_key` instead of `request_id` as the `id` field was removed from the `Request`. +- `set_metadata` method has been removed. 
+ +Some changes in the related model classes: + +- `resource_directory` in `RequestQueueMetadata` - removed; use the corresponding `path_to_*` property instead. +- `stats` field in `RequestQueueMetadata` - removed as it was unused. +- `RequestQueueHead` - replaced by `RequestQueueHeadWithLocks`. + +## New architecture of storage clients + +In v1.0, the storage client system has been completely reworked to simplify implementation and make custom storage clients easier to write. + +See the [Storage clients guide](https://crawlee.dev/python/docs/guides/storage-clients) for more details. + +### New dedicated storage clients + +Previously, `MemoryStorageClient` handled both in-memory storage and optional file system persistence. This has now been split into two distinct storage clients: + +- **`MemoryStorageClient`** - Stores all data in memory only. +- **`FileSystemStorageClient`** - Persists data on the file system, with in-memory caching for better performance. + +**Before (v0.6):** + +```python +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient + +# In-memory only +configuration = Configuration(persist_storage=False) +storage_client = MemoryStorageClient.from_config(configuration) + +# File-system persistence +configuration = Configuration(persist_storage=True) +storage_client = MemoryStorageClient.from_config(configuration) +``` + +**Now (v1.0):** + +```python +from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient + +# In-memory only +storage_client = MemoryStorageClient() + +# File-system persistence +storage_client = FileSystemStorageClient() +``` + +### Registering a storage client + +The way you register a storage client remains unchanged: + +```python +from crawlee import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + +# Create custom storage client +storage_client = 
MemoryStorageClient()
+
+# Then register it globally
+service_locator.set_storage_client(storage_client)
+
+# Or use it for a single crawler only
+crawler = ParselCrawler(storage_client=storage_client)
+
+# Or use it for a single storage only
+dataset = await Dataset.open(
+    name='my-dataset',
+    storage_client=storage_client,
+)
+```
+
+### Instance caching
+
+Thanks to instance caching, `Dataset.open`, `KeyValueStore.open`, and `RequestQueue.open` now return the same instance for the same arguments. Direct calls to `StorageClient.open_*` always return new instances.
+
+### Writing custom storage clients
+
+The interface for custom storage clients has been simplified:
+
+- One storage client per storage type (`RequestQueue`, `KeyValueStore`, `Dataset`).
+- Collection storage clients have been removed.
+- The number of methods that have to be implemented has been reduced.
+
+## ServiceLocator changes
+
+### ServiceLocator is stricter with registering services
+You can register the services just once, and you can no longer override already registered services.
+
+**Before (v0.6):**
+```python
+from crawlee import service_locator
+from crawlee.storage_clients import MemoryStorageClient
+
+service_locator.set_storage_client(MemoryStorageClient())
+service_locator.set_storage_client(MemoryStorageClient())
+```
+**Now (v1.0):**
+
+```python
+from crawlee import service_locator
+from crawlee.storage_clients import MemoryStorageClient
+
+service_locator.set_storage_client(MemoryStorageClient())
+service_locator.set_storage_client(MemoryStorageClient()) # Raises an error
+```
+
+### BasicCrawler has its own instance of ServiceLocator to track its own services
+Services explicitly passed to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services.
+
+**Before (v0.6):**
+```python
+from crawlee import service_locator
+from crawlee.crawlers import BasicCrawler
+from crawlee.storage_clients import MemoryStorageClient
+from crawlee.storages import Dataset
+
+
+async def main() -> None:
+    custom_storage_client = MemoryStorageClient()
+    crawler = BasicCrawler(storage_client=custom_storage_client)
+
+    assert service_locator.get_storage_client() is custom_storage_client
+    assert await crawler.get_dataset() is await Dataset.open()
+```
+**Now (v1.0):**
+
+```python
+from crawlee import service_locator
+from crawlee.crawlers import BasicCrawler
+from crawlee.storage_clients import MemoryStorageClient
+from crawlee.storages import Dataset
+
+
+async def main() -> None:
+    custom_storage_client = MemoryStorageClient()
+    crawler = BasicCrawler(storage_client=custom_storage_client)
+
+    assert service_locator.get_storage_client() is not custom_storage_client
+    assert await crawler.get_dataset() is not await Dataset.open()
+```
+
+This allows running two crawlers with different services at the same time.
+
+**Now (v1.0):**
+
+```python
+from crawlee.crawlers import BasicCrawler
+from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient
+from crawlee.configuration import Configuration
+from crawlee.events import LocalEventManager
+
+custom_configuration_1 = Configuration()
+custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1)
+custom_storage_client_1 = MemoryStorageClient()
+
+custom_configuration_2 = Configuration()
+custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2)
+custom_storage_client_2 = FileSystemStorageClient()
+
+crawler_1 = BasicCrawler(
+    configuration=custom_configuration_1,
+    event_manager=custom_event_manager_1,
+    storage_client=custom_storage_client_1,
+)
+
+crawler_2 = BasicCrawler(
+    configuration=custom_configuration_2,
+    event_manager=custom_event_manager_2,
+    storage_client=custom_storage_client_2,
+)
+
+# use crawlers without runtime crash...
+```
+
+## Other smaller updates
+
+There are several other smaller updates, described below.
+
+### Python version support
+
+We have dropped support for Python 3.9. The minimum supported version is now Python 3.10.
+
+### Changes in Configuration
+
+The fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use.
+
+### Changes in Request
+
+`Request` objects no longer have an `id` field; all of its usages have been replaced by the `unique_key` field.
+
+### Changes in HttpResponse
+
+The method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers.
+
+**Before (v0.6):**
+
+```python
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+async def main() -> None:
+    crawler = ParselCrawler()
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        # highlight-next-line
+        content = context.http_response.read()
+        # ...
+ + await crawler.run(['https://crawlee.dev/']) +``` + +**Now (v1.0):** + +```python +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + +async def main() -> None: + crawler = ParselCrawler() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + # highlight-next-line + content = await context.http_response.read() + # ... + + await crawler.run(['https://crawlee.dev/']) +``` + +### New storage naming restrictions + +We've introduced naming restrictions for storages to ensure compatibility with Apify Platform requirements and prevent potential conflicts. Storage names may include only letters (aโ€“z, Aโ€“Z), digits (0โ€“9), and hyphens (-), with hyphens allowed only in the middle of the name (for example, my-storage-1). diff --git a/website/versioned_sidebars/version-0.6-sidebars.json b/website/versioned_sidebars/version-0.6-sidebars.json new file mode 100644 index 0000000000..39c3ebdc08 --- /dev/null +++ b/website/versioned_sidebars/version-0.6-sidebars.json @@ -0,0 +1,109 @@ +{ + "docs": [ + "quick-start/quick-start", + { + "type": "category", + "label": "Introduction", + "collapsed": false, + "link": { + "type": "doc", + "id": "introduction/introduction" + }, + "items": [ + "introduction/setting-up", + "introduction/first-crawler", + "introduction/adding-more-urls", + "introduction/real-world-project", + "introduction/crawling", + "introduction/scraping", + "introduction/saving-data", + "introduction/refactoring", + "introduction/deployment" + ] + }, + { + "type": "category", + "label": "Guides", + "link": { + "type": "generated-index", + "title": "Guides", + "slug": "/guides", + "keywords": [ + "guides" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "guides" + } + ] + }, + { + "type": "category", + "label": "Deployment", + "link": { + "type": "generated-index", + "title": "Deployment guides", + "description": "Here you can find guides on how to deploy your crawlers to various 
cloud providers.", + "slug": "/deployment" + }, + "items": [ + { + "type": "doc", + "id": "deployment/apify-platform", + "label": "Deploy on Apify" + }, + { + "type": "category", + "label": "Deploy to Google Cloud", + "items": [ + "deployment/gcp-cloud-run-functions", + "deployment/gcp-cloud-run" + ] + } + ] + }, + { + "type": "category", + "label": "Examples", + "link": { + "type": "generated-index", + "title": "Examples", + "slug": "/examples", + "keywords": [ + "examples" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "examples" + } + ] + }, + { + "type": "category", + "label": "Upgrading", + "link": { + "type": "generated-index", + "title": "Upgrading", + "slug": "/upgrading", + "keywords": [ + "upgrading" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "upgrading" + } + ] + }, + { + "type": "doc", + "label": "Changelog", + "id": "changelog" + } + ] +} diff --git a/website/versioned_sidebars/version-1.6-sidebars.json b/website/versioned_sidebars/version-1.6-sidebars.json new file mode 100644 index 0000000000..a1abadd701 --- /dev/null +++ b/website/versioned_sidebars/version-1.6-sidebars.json @@ -0,0 +1,118 @@ +{ + "docs": [ + "quick-start/quick-start", + { + "type": "category", + "label": "Introduction", + "collapsed": false, + "link": { + "type": "doc", + "id": "introduction/introduction" + }, + "items": [ + "introduction/setting-up", + "introduction/first-crawler", + "introduction/adding-more-urls", + "introduction/real-world-project", + "introduction/crawling", + "introduction/scraping", + "introduction/saving-data", + "introduction/refactoring", + "introduction/deployment" + ] + }, + { + "type": "category", + "label": "Guides", + "collapsed": true, + "link": { + "type": "generated-index", + "title": "Guides", + "slug": "/guides", + "keywords": [ + "guides" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "guides" + } + ] + }, + { + "type": "category", + "label": "Deployment", + "collapsed": true, + 
"link": { + "type": "generated-index", + "title": "Deployment guides", + "description": "Here you can find guides on how to deploy your crawlers to various cloud providers.", + "slug": "/deployment" + }, + "items": [ + { + "type": "doc", + "id": "deployment/apify-platform", + "label": "Deploy on Apify" + }, + { + "type": "doc", + "id": "deployment/aws-lambda", + "label": "Deploy on AWS Lambda" + }, + { + "type": "category", + "label": "Deploy to Google Cloud", + "items": [ + "deployment/gcp-cloud-run-functions", + "deployment/gcp-cloud-run" + ] + } + ] + }, + { + "type": "category", + "label": "Examples", + "collapsed": true, + "link": { + "type": "generated-index", + "title": "Examples", + "slug": "/examples", + "keywords": [ + "examples" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "examples" + } + ] + }, + { + "type": "category", + "label": "Upgrading", + "collapsed": true, + "link": { + "type": "generated-index", + "title": "Upgrading", + "slug": "/upgrading", + "keywords": [ + "upgrading" + ] + }, + "items": [ + { + "type": "autogenerated", + "dirName": "upgrading" + } + ] + }, + { + "type": "doc", + "label": "Changelog", + "id": "changelog" + } + ] +} diff --git a/website/versions.json b/website/versions.json new file mode 100644 index 0000000000..d96f8f8741 --- /dev/null +++ b/website/versions.json @@ -0,0 +1,4 @@ +[ + "1.6", + "0.6" +]