Skip to content

Commit ce191ca

Browse files
authored
chore: Add tests for playwright utils (#1577)
### Description

- Add tests for `infinite_scroll`, based on a page with a simple script that loads content dynamically.
- Add tests for `block_requests`, based on a page with downloadable resources.

### Issues

- Closes: #1568

### Testing

- Added new tests.
1 parent 1ae351e commit ce191ca

File tree

6 files changed

+236
-0
lines changed

6 files changed

+236
-0
lines changed

before_scroll.png

4.15 KB
Loading
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
from playwright.async_api import async_playwright
2+
from yarl import URL
3+
4+
from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll
5+
6+
7+
async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
    """Verify that `infinite_scroll` ends up with as many items as scrolling the page by hand."""
    scroll_page_url = str(server_url / 'infinite_scroll')

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)

        # Reference run: scroll to the bottom by hand, sampling the item count before each scroll.
        manual_page = await chromium.new_page()
        await manual_page.goto(scroll_page_url)

        manual_count = 0
        for _ in range(4):
            manual_count = len(await manual_page.query_selector_all('.item'))
            await manual_page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            await manual_page.wait_for_timeout(1000)

        await manual_page.close()

        # Open a fresh page so the utility starts from the initial, unscrolled state.
        utility_page = await chromium.new_page()
        await utility_page.goto(scroll_page_url)

        initial_count = len(await utility_page.query_selector_all('.item'))
        assert initial_count != manual_count
        assert initial_count == 10

        await infinite_scroll(utility_page)

        final_count = len(await utility_page.query_selector_all('.item'))

        # The utility must have loaded more items, and exactly as many as the manual run saw.
        assert initial_count < final_count
        assert manual_count == final_count

        await chromium.close()
43+
44+
45+
async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:
    """Verify that `infinite_scroll` completes without raising on a page that has no infinite scrolling."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(str(server_url))

        await infinite_scroll(tab)

        # Reading the title afterwards confirms the page can still be queried once the utility returns.
        assert await tab.title() == 'Hello, world!'

        await chromium.close()
60+
61+
62+
async def test_double_call_infinite_scroll(server_url: URL) -> None:
    """Verify that a second `infinite_scroll` call leaves the number of items unchanged."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(str(server_url / 'infinite_scroll'))

        # Record the item count after each of two consecutive runs of the utility.
        counts_per_run = []
        for _ in range(2):
            await infinite_scroll(tab)
            counts_per_run.append(len(await tab.query_selector_all('.item')))

        assert counts_per_run[0] == counts_per_run[1]

        await chromium.close()
79+
80+
81+
async def test_block_requests_default(server_url: URL) -> None:
    """Verify which resources finish loading without and with the default `block_requests` setup."""
    page_url = str(server_url / 'resource_loading_page')
    finished_plain = []
    finished_blocked = []

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()

        # Baseline run: nothing is blocked; record the last path segment of every finished request.
        plain_tab = await chromium.new_page()
        plain_tab.on('requestfinished', lambda request: finished_plain.append(request.url.rpartition('/')[2]))
        await plain_tab.goto(page_url)
        await plain_tab.wait_for_load_state('networkidle')
        await plain_tab.close()

        # Second run: same page, but with `block_requests` installed before navigating.
        blocking_tab = await chromium.new_page()
        blocking_tab.on('requestfinished', lambda request: finished_blocked.append(request.url.rpartition('/')[2]))
        await block_requests(blocking_tab)
        await blocking_tab.goto(page_url)
        await blocking_tab.wait_for_load_state('networkidle')
        await blocking_tab.close()

        await chromium.close()

    # Without blocking, the page document, the script and the image all finish loading.
    assert set(finished_plain) == {'resource_loading_page', 'test.js', 'test.png'}

    # With default blocking, the image is expected to be missing while the script still loads.
    assert set(finished_blocked) == {'resource_loading_page', 'test.js'}
114+
115+
116+
async def test_block_requests_with_extra_patterns(server_url: URL) -> None:
    """Verify that with `extra_url_patterns=['*.js']` only the page document itself finishes loading."""
    page_url = str(server_url / 'resource_loading_page')
    finished_names = []

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()

        # Record the last path segment of every request that finishes successfully.
        tab.on('requestfinished', lambda request: finished_names.append(request.url.rpartition('/')[2]))
        await block_requests(tab, extra_url_patterns=['*.js'])
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        await chromium.close()

    # Neither the script nor the image is expected among the finished requests.
    assert set(finished_names) == {'resource_loading_page'}
136+
137+
138+
async def test_block_requests_with_custom_patterns(server_url: URL) -> None:
    """Verify that with `url_patterns=['*.js']` the script is blocked while the image still loads."""
    page_url = str(server_url / 'resource_loading_page')
    finished_names = []

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()

        # Record the last path segment of every request that finishes successfully.
        tab.on('requestfinished', lambda request: finished_names.append(request.url.rpartition('/')[2]))
        await block_requests(tab, url_patterns=['*.js'])
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        await chromium.close()

    # Only the page document and the image are expected; the script matches the custom pattern.
    assert set(finished_names) == {'resource_loading_page', 'test.png'}

tests/unit/server.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
GENERIC_RESPONSE,
1919
HELLO_WORLD,
2020
INCAPSULA,
21+
INFINITE_SCROLL,
2122
PROBLEMATIC_LINKS,
23+
RESOURCE_LOADING_PAGE,
2224
ROBOTS_TXT,
2325
SECONDARY_INDEX,
2426
START_ENQUEUE,
@@ -121,6 +123,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
121123
'xml': hello_world_xml,
122124
'robots.txt': robots_txt,
123125
'get_compressed': get_compressed,
126+
'infinite_scroll': infinite_scroll_endpoint,
127+
'resource_loading_page': resource_loading_endpoint,
124128
}
125129
path = URL(scope['path']).parts[1]
126130
# Route requests to appropriate handlers
@@ -411,6 +415,22 @@ async def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send)
411415
await send({'type': 'http.response.body', 'body': gzip.compress(HELLO_WORLD * 1000)})
412416

413417

418+
async def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `INFINITE_SCROLL` HTML page.

    The scope and receive arguments are accepted but not used.
    """
    await send_html_response(send, INFINITE_SCROLL)
424+
425+
426+
async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `RESOURCE_LOADING_PAGE` HTML page.

    The scope and receive arguments are accepted but not used.
    """
    await send_html_response(send, RESOURCE_LOADING_PAGE)
432+
433+
414434
class TestServer(Server):
415435
"""A test HTTP server implementation based on Uvicorn Server."""
416436

tests/unit/server_endpoints.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,62 @@
6969
7070
sitemap: http://not-exists.com/sitemap_1.xml
7171
sitemap: http://not-exists.com/sitemap_2.xml"""
72+
73+
74+
INFINITE_SCROLL = b"""\
75+
<!DOCTYPE html>
76+
<html>
77+
<body>
78+
<div id="content"></div>
79+
80+
<script>
81+
let page = 0;
82+
let loading = false;
83+
84+
for (let i = 0; i < 10; i++) {
85+
const div = document.createElement('div');
86+
div.className = 'item';
87+
div.style.height = '200px';
88+
div.textContent = 'Item ' + (i + 1);
89+
document.getElementById('content').appendChild(div);
90+
}
91+
92+
async function loadMore() {
93+
if (loading || page >= 3) return;
94+
loading = true;
95+
page++;
96+
97+
await new Promise(resolve => setTimeout(resolve, 100));
98+
99+
for (let i = 0; i < 10; i++) {
100+
const div = document.createElement('div');
101+
div.className = 'item';
102+
div.style.height = '200px';
103+
div.textContent = 'Item ' + (page * 10 + i + 1);
104+
document.getElementById('content').appendChild(div);
105+
}
106+
107+
loading = false;
108+
}
109+
110+
window.addEventListener('scroll', () => {
111+
if (window.innerHeight + window.scrollY >= document.body.offsetHeight - 100) {
112+
loadMore();
113+
}
114+
});
115+
</script>
116+
</body>
117+
</html>
118+
"""
119+
120+
RESOURCE_LOADING_PAGE = b"""\
121+
<!DOCTYPE html>
122+
<html>
123+
<head>
124+
<script src="/server_static/test.js"></script>
125+
</head>
126+
<body>
127+
<img src="/server_static/test.png" />
128+
</body>
129+
</html>
130+
"""

tests/unit/server_static/test.js

Whitespace-only changes.

tests/unit/server_static/test.png

Loading

0 commit comments

Comments
 (0)