Skip to content

Commit 5af2de3

Browse files
committed
Enhancements
- Add `wait_for_new_page`, a context manager to help with operations that open a new tab or window. - `find_element` and `find_elements` now have `waiting_time` and `ensure_visible` parameters for an enhanced experience and ease of use, making them similar to the `find` methods for computer-vision. - Add `wait_for_stale_element`, which waits until an element is modified on the DOM. - Add `wait_for_element_visibility`, which waits until an element becomes visible or invisible. - Add new `parsers` module with `table_to_dict`, which allows users to extract structured data from HTML tables into a list of dictionaries. - Add `element_as_select` to ease the handling of select elements on forms. Documentation - New section `Handling Data` showcasing how you can easily extract data from web pages. - New section `Interacting with Forms` showcasing how to deal with select elements and file input elements.
1 parent d58a688 commit 5af2de3

File tree

8 files changed

+336
-21
lines changed

8 files changed

+336
-21
lines changed

botcity/web/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from .bot import WebBot, Browser, BROWSER_CONFIGS, By # noqa: F401, F403
2+
from .parsers import table_to_dict, data_from_row, sanitize_header # noqa: F401, F403
3+
from .util import element_as_select # noqa: F401, F403
24

35
from botcity.web._version import get_versions
46
__version__ = get_versions()['version']

botcity/web/bot.py

Lines changed: 87 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import shutil
1111
import time
1212
from typing import List
13+
from contextlib import contextmanager
1314

1415
from botcity.base import BaseBot, State
1516
from botcity.base.utils import only_if_element
@@ -21,7 +22,8 @@
2122
from selenium.webdriver.common.by import By
2223
from selenium.webdriver.common.keys import Keys
2324
from selenium.webdriver.remote.webelement import WebElement
24-
from selenium.webdriver.support.ui import WebDriverWait
25+
from selenium.webdriver.support.wait import WebDriverWait, TimeoutException, NoSuchElementException
26+
from selenium.webdriver.support import expected_conditions as EC
2527

2628
from . import config, cv2find
2729
from .browsers import BROWSER_CONFIGS, Browser
@@ -859,6 +861,26 @@ def browse(self, url):
859861
"""
860862
self.navigate_to(url)
861863

864+
@contextmanager
def wait_for_new_page(self, waiting_time=10000, activate=True):
    """Context manager that waits for a page opened inside the `with` body.

    The open tabs are snapshotted (via `get_tabs`) on entry. Once the body
    finishes, `get_tabs` is polled every 100ms until its result differs from
    the snapshot or `waiting_time` elapses. On timeout the context manager
    exits silently without activating anything.

    Args:
        waiting_time (int, optional): Maximum time (ms) to wait for the tabs
            to change after the body has run. Defaults to 10000.
        activate (bool, optional): Whether or not to activate the new page.
            Defaults to True.

    """
    tabs_before = self.get_tabs()
    yield

    # monotonic() is immune to wall-clock adjustments while we are polling.
    start_time = time.monotonic()
    current_tabs = self.get_tabs()
    while current_tabs == tabs_before:
        elapsed_ms = (time.monotonic() - start_time) * 1000
        if elapsed_ms > waiting_time:
            return
        time.sleep(0.1)
        current_tabs = self.get_tabs()

    if activate:
        # WebDriver does not guarantee the ordering of window handles, so
        # pick the handle that was not present before the body ran instead
        # of assuming the new page is the last entry.
        new_tabs = [tab for tab in current_tabs if tab not in tabs_before]
        # If nothing new appeared (e.g. a tab was closed instead), keep the
        # previous behaviour of activating the last handle.
        target = new_tabs[-1] if new_tabs else current_tabs[-1]
        self.activate_tab(target)
883+
862884
def execute_javascript(self, code):
863885
"""
864886
Execute the given javascript code.
@@ -1037,15 +1059,18 @@ def wait_for_downloads(self, timeout: int = 120000):
10371059

10381060
wait_method = BROWSER_CONFIGS.get(self.browser).get("wait_for_downloads")
10391061
# waits for all the files to be completed
1040-
WebDriverWait(self._driver, timeout/1000, 1).until(wait_method)
1062+
WebDriverWait(self._driver, timeout/1000.0, 1).until(wait_method)
10411063

1042-
def find_elements(self, selector: str, by: By = By.CSS_SELECTOR) -> List[WebElement]:
1064+
def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = True) -> List[WebElement]:
10431065
"""Find elements using the specified selector with selector type specified by `by`.
10441066
10451067
Args:
10461068
selector (str): The selector string to be used.
10471069
by (str, optional): Selector type. Defaults to By.CSS_SELECTOR.
10481070
[See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By)
1071+
waiting_time (int, optional): Maximum wait time (ms) to search for a hit.
1072+
Defaults to 10000ms (10s).
1073+
ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to True.
10491074
10501075
Returns:
10511076
List[WebElement]: List of elements found.
@@ -1059,16 +1084,32 @@ def find_elements(self, selector: str, by: By = By.CSS_SELECTOR) -> List[WebElem
10591084
...
10601085
```
10611086
"""
1062-
return self._driver.find_elements(by, selector)
1087+
condition = EC.visibility_of_all_elements_located if ensure_visible else EC.presence_of_all_elements_located
1088+
1089+
try:
1090+
elements = WebDriverWait(
1091+
self._driver, timeout=waiting_time / 1000.0
1092+
).until(
1093+
condition((by, selector))
1094+
)
1095+
return elements
1096+
except (TimeoutException, NoSuchElementException) as ex:
1097+
print("Exception on find_elements", ex)
1098+
return None
10631099

1064-
def find_element(self, selector: str, by: str = By.CSS_SELECTOR) -> WebElement:
1100+
def find_element(self, selector: str, by: str = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = False, ensure_clickable: bool = False) -> WebElement:
10651101
"""Find an element using the specified selector with selector type specified by `by`.
10661102
If more than one element is found, the first instance is returned.
10671103
10681104
Args:
10691105
selector (str): The selector string to be used.
10701106
by (str, optional): Selector type. Defaults to By.CSS_SELECTOR.
10711107
[See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By)
1108+
waiting_time (int, optional): Maximum wait time (ms) to search for a hit.
1109+
Defaults to 10000ms (10s).
1110+
ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to False.
1111+
ensure_clickable (bool, optional): Whether to wait for the element to be clickable. Defaults to False.
1112+
If True, `ensure_clickable` takes precedence over `ensure_visible`.
10721113
10731114
Returns:
10741115
WebElement: The element found.
@@ -1084,9 +1125,47 @@ def find_element(self, selector: str, by: str = By.CSS_SELECTOR) -> WebElement:
10841125
...
10851126
```
10861127
"""
1087-
out = self.find_elements(selector=selector, by=by)
1088-
if out:
1089-
return out[0]
1128+
condition = EC.visibility_of_element_located if ensure_visible else EC.presence_of_element_located
1129+
condition = EC.element_to_be_clickable if ensure_clickable else condition
1130+
1131+
try:
1132+
element = WebDriverWait(
1133+
self._driver, timeout=waiting_time/1000.0
1134+
).until(
1135+
condition((by, selector))
1136+
)
1137+
return element
1138+
except (TimeoutException, NoSuchElementException):
1139+
return None
1140+
1141+
def wait_for_stale_element(self, element: WebElement, timeout: int = 10000):
    """
    Wait until the WebElement element becomes stale (outdated).

    Timeouts are not reported: if the element is still not stale after
    `timeout`, the method simply returns.

    Args:
        element (WebElement): The element to monitor for staleness.
        timeout (int, optional): Timeout in millis. Defaults to 10000.
    """
    try:
        # WebDriverWait expects seconds, the public API uses milliseconds.
        WebDriverWait(self._driver, timeout=timeout/1000.0).until(EC.staleness_of(element))
    except (TimeoutException, NoSuchElementException):
        # Best effort by design: callers are not notified of a timeout.
        pass
1153+
1154+
def wait_for_element_visibility(self, element: WebElement, visible: bool = True, waiting_time=10000):
    """Block until the element reaches the requested visibility state.

    No exception handling is done here, so whatever `WebDriverWait.until`
    raises when the wait expires propagates to the caller.

    Args:
        element (WebElement): The element to wait for.
        visible (bool, optional): True to wait until the element is visible,
            False to wait until it is hidden. Defaults to True.
        waiting_time (int, optional): Maximum wait time (ms).
            Defaults to 10000ms (10s).
    """
    expected_state = EC.visibility_of if visible else EC.invisibility_of_element

    # WebDriverWait expects seconds, the public API uses milliseconds.
    waiter = WebDriverWait(self._driver, timeout=waiting_time/1000.0)
    waiter.until(expected_state(element))
10901169

10911170
def set_file_input_element(self, element: WebElement, filepath: str):
10921171
"""Configure the filepath for upload in a file element.

botcity/web/parsers.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import collections
2+
import string
3+
from typing import Dict, List
4+
from selenium.webdriver.remote.webelement import WebElement
5+
6+
7+
def data_from_row(row: "WebElement", cell_tag="td") -> List[str]:
    """Extract the text of every cell in a row and return it as a list.

    Args:
        row (WebElement): The row element.
        cell_tag (str, optional): The HTML tag associated with the row cells. Defaults to "td".

    Returns:
        list: List of strings with the contents, in the order the cells were found.
    """
    # "tag name" is the locator strategy string behind `By.TAG_NAME`. The
    # generic `find_elements(by, value)` is used because the
    # `find_elements_by_*` shortcuts were removed in Selenium 4.3.
    return [cell.text for cell in row.find_elements("tag name", cell_tag)]
20+
21+
22+
def sanitize_header(labels: List[str]):
    """Sanitize header labels so they can be used as dictionary keys.

    Every non-blank label is lower-cased, has its ASCII punctuation removed
    and its spaces replaced with underscores. Labels that are blank, or that
    end up empty once punctuation is removed, become `col_<index>`. Labels
    that occur more than once after that receive a `_1`, `_2`, ... suffix in
    order of appearance.

    Args:
        labels (List[str]): The raw header labels. The list is modified in place.

    Returns:
        list: The same list object, holding the sanitized labels.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)

    for idx, label in enumerate(labels):
        cleaned = ""
        if label.strip():
            cleaned = label.lower().translate(drop_punctuation).replace(" ", "_")
        # A blank label, or one made only of punctuation, would otherwise
        # turn into an empty key.
        labels[idx] = cleaned or f"col_{idx}"

    # Deduplicate by adding _1, _2, _3 to repeated labels
    repeated = {label for label, n in collections.Counter(labels).items() if n > 1}
    seen = collections.Counter()
    for idx, label in enumerate(labels):
        if label in repeated:
            seen[label] += 1
            labels[idx] = f"{label}_{seen[label]}"

    return labels
48+
49+
50+
def table_to_dict(table: WebElement, has_header: bool = True, skip_rows: int = 0, header_tag: str = "th") -> List[Dict]:
    """Convert a table WebElement to a list of dicts, one per data row.

    Keys come from the sanitized header row when `has_header` is True,
    otherwise they are generated as `col_0`, `col_1`, ... from the number of
    `td` cells in the first remaining row. Labels and cells are paired with
    `zip`, so a row with more cells than labels loses the extra cells and a
    row with fewer cells yields a dict with fewer keys.

    Args:
        table (WebElement): The table element.
        has_header (bool, optional): Whether or not to parse a header. Defaults to True.
        skip_rows (int, optional): Number of rows to skip from the top. Defaults to 0.
        header_tag (str, optional): The HTML tag associated with the header cell. Defaults to "th".

    Returns:
        list: List with dict for each row. Empty when the table has no rows
            left after skipping.
    """
    # "tag name" is the locator strategy string behind `By.TAG_NAME`. The
    # generic `find_elements(by, value)` is used because the
    # `find_elements_by_*` shortcuts were removed in Selenium 4.3.
    by_tag = "tag name"

    # Collect all rows from table
    rows = table.find_elements(by_tag, "tr")

    # Skip rows if informed
    if skip_rows:
        rows = rows[skip_rows:]

    # Nothing to parse: avoid indexing into an empty list below.
    if not rows:
        return []

    if has_header:
        # Read and sanitize the header labels, then drop the header row
        labels = sanitize_header(data_from_row(rows[0], cell_tag=header_tag))
        rows = rows[1:]
    else:
        # Make up header labels
        num_cols = len(rows[0].find_elements(by_tag, "td"))
        labels = [f"col_{i}" for i in range(num_cols)]

    # Assemble output list
    return [dict(zip(labels, data_from_row(row))) for row in rows]

botcity/web/util.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,31 @@
11
import shutil
2+
import tempfile
23

4+
from selenium.webdriver.remote.webelement import WebElement
5+
from selenium.webdriver.support.select import Select
36

4-
def cleanup_temp_dir(temp_dir: tempfile.TemporaryDirectory) -> None:
    """
    Remove a temporary directory together with everything inside it.

    A falsy `temp_dir` is ignored. If the regular cleanup fails with an
    `OSError`, the directory tree is removed on a best-effort basis instead.

    Args:
        temp_dir (tempfile.TemporaryDirectory): The temporary directory to delete.
    """
    if not temp_dir:
        return

    try:
        temp_dir.cleanup()
    except OSError:
        # Fall back to a forced removal that ignores whatever cannot be deleted.
        shutil.rmtree(temp_dir.name, ignore_errors=True)
20+
21+
22+
def element_as_select(element: WebElement) -> Select:
    """Build a Selenium `Select` helper around the given element.

    Args:
        element (WebElement): The element to wrap.

    Returns:
        Select: The Select object created from `element`.
    """
    select = Select(element)
    return select

docs/forms.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Interacting with Forms
2+
3+
When dealing with forms, we often need to fill in the form and submit it.
4+
5+
While most of the operations are trivial, there are some that are not, such as choosing an option in a select element or dealing with file uploads.
6+
7+
For that, we developed some utility functions that you can use.
8+
9+
## Select Element
10+
11+
After grabbing the element via the `find_element` or `find_elements` functions, we can use the `element_as_select` function to convert it into a `Select` object.
12+
13+
::: botcity.web.util.element_as_select
14+
15+
### Example usage
16+
17+
```python
18+
# Import the function
19+
from botcity.web import By
from botcity.web.util import element_as_select
20+
...
21+
# Fetch the select element
22+
element = self.find_element("select", By.TAG_NAME)
23+
# Convert the element into a Select object
24+
select_element = element_as_select(element)
25+
# Select the option based on visible text
26+
select_element.select_by_visible_text("Option 1")
27+
...
28+
```
29+
30+
## File Upload
31+
32+
After grabbing the element via the `find_element` or `find_elements` functions, we can use the `set_file_input_element` method to assign the file path to the element.
33+
34+
### Example usage
35+
36+
```python
37+
from botcity.web import By
38+
...
39+
# Find the input element of type `file` using CSS_SELECTOR.
40+
elem = self.find_element("body > form > input[type=file]", By.CSS_SELECTOR)
41+
# Configure the file to be used when processing the upload
42+
self.set_file_input_element(elem, "./test.txt")
43+
...
44+
```

0 commit comments

Comments
 (0)