Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,67 +19,95 @@ concurrency:
jobs:
test_unit:
strategy:
fail-fast: false
matrix:
python-version: [ "3.9","3.10","3.11", "3.12" ]
python-version: [ "3.11", "3.12", "3.13" ]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v6
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
env:
UV_LOCKED: "1"
UV_PYTHON: ${{ matrix.python-version }}
run: make install
- name: Run unit tests
env:
UV_PYTHON: ${{ matrix.python-version }}
run: |
make test-unit

lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v6
- name: Set up Python 3.13
uses: actions/setup-python@v5
with:
python-version: "3.13"
- name: Install dependencies
env:
UV_LOCKED: "1"
UV_PYTHON: "3.13"
run: make install
- name: Lint
env:
UV_PYTHON: "3.13"
run: |
make lint

test_integration:
strategy:
fail-fast: false
matrix:
python-version: [ "3.9","3.10","3.11", "3.12" ]
runs-on: ubuntu-latest
python-version: [ "3.11", "3.12", "3.13" ]
runs-on: opensource-linux-8core
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v6
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
env:
UV_LOCKED: "1"
UV_PYTHON: ${{ matrix.python-version }}
run: make install
- name: Run integration tests
run: |
make test-integration-docker
env:
UV_PYTHON: ${{ matrix.python-version }}
UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
run: |
make test-integration-docker

test_contract:
strategy:
fail-fast: false
matrix:
python-version: [ "3.9","3.10","3.11", "3.12" ]
runs-on: ubuntu-latest
env:
POETRY_VIRTUALENVS_IN_PROJECT: "true"
python-version: [ "3.11", "3.12", "3.13" ]
runs-on: opensource-linux-8core
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v6
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
env:
UV_LOCKED: "1"
UV_PYTHON: ${{ matrix.python-version }}
run: |
make install
- name: Run contract tests
env:
UV_PYTHON: ${{ matrix.python-version }}
run: |
make test-contract

15 changes: 7 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates
## install: installs all test, dev, and experimental requirements
.PHONY: install
install:
pip install -U poetry
python scripts/prepare_readme.py
poetry install
uv sync --locked

## install-speakeasy-cli: download the speakeasy cli tool
.PHONY: install-speakeasy-cli
Expand All @@ -28,30 +27,30 @@ test: test-unit test-integration-docker

.PHONY: test-unit
test-unit:
PYTHONPATH=. poetry run pytest -n auto _test_unstructured_client -v -k "unit"
PYTHONPATH=. uv run pytest -n auto _test_unstructured_client -v -k "unit"

.PHONY: test-contract
test-contract:
PYTHONPATH=. poetry run pytest -n auto _test_contract -v
PYTHONPATH=. uv run pytest -n auto _test_contract -v

# Assumes you have unstructured-api running on localhost:8000
.PHONY: test-integration
test-integration:
PYTHONPATH=. poetry run pytest -n auto _test_unstructured_client -v -k "integration"
PYTHONPATH=. uv run pytest -n auto _test_unstructured_client -v -k "integration"

# Runs the unstructured-api in docker for tests
.PHONY: test-integration-docker
test-integration-docker:
-docker stop unstructured-api && docker kill unstructured-api
docker run --name unstructured-api -p 8000:8000 -d --rm ${DOCKER_IMAGE} --host 0.0.0.0 && \
curl -s -o /dev/null --retry 10 --retry-delay 5 --retry-all-errors http://localhost:8000/general/docs && \
PYTHONPATH=. poetry run pytest -n auto _test_unstructured_client -v -k "integration" && \
PYTHONPATH=. uv run pytest -n auto _test_unstructured_client -v -k "integration" && \
docker kill unstructured-api

.PHONY: lint
lint:
poetry run pylint --rcfile=pylintrc src
poetry run mypy src
uv run pylint --rcfile=pylintrc src
uv run mypy src

#############
# Speakeasy #
Expand Down
3 changes: 1 addition & 2 deletions _test_contract/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@

from unstructured_client import UnstructuredClient, utils

# Python 3.9 workaround: eagerly import retries to avoid lazy import race condition
# This prevents a KeyError in module lock when templates.py triggers lazy import of utils.retries
# Eagerly import retries to avoid a lazy import race when templates.py first loads utils.retries.
from unstructured_client.utils import retries # noqa: F401

FAKE_API_KEY = "91pmLBeETAbXCpNylRsLq11FdiZPTk"
Expand Down
143 changes: 100 additions & 43 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections import Counter, defaultdict
import math
import tempfile
from pathlib import Path
from typing import Literal
Expand All @@ -22,6 +24,95 @@
from unstructured_client._hooks.custom import split_pdf_hook

FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
TEST_TIMEOUT_MS = 300_000

_HI_RES_STRATEGIES = ("hi_res", Strategy.HI_RES)


def _allowed_delta(expected: int, *, absolute: int, ratio: float) -> int:
return max(absolute, math.ceil(expected * ratio))


def _text_size(elements) -> int:
return sum(len((element.get("text") or "").strip()) for element in elements)


def _elements_by_page(elements):
pages = defaultdict(list)
for element in elements:
pages[element["metadata"]["page_number"]].append(element)
return pages


def _assert_hi_res_output_is_similar(resp_split, resp_single):
split_pages = _elements_by_page(resp_split.elements)
single_pages = _elements_by_page(resp_single.elements)

assert set(split_pages) == set(single_pages)

assert abs(len(resp_split.elements) - len(resp_single.elements)) <= _allowed_delta(
len(resp_single.elements),
absolute=4,
ratio=0.1,
)

split_type_counts = Counter(element["type"] for element in resp_split.elements)
single_type_counts = Counter(element["type"] for element in resp_single.elements)
assert set(split_type_counts) == set(single_type_counts)
for element_type, expected_count in single_type_counts.items():
assert abs(split_type_counts[element_type] - expected_count) <= _allowed_delta(
expected_count,
absolute=2,
ratio=0.2,
)

assert abs(_text_size(resp_split.elements) - _text_size(resp_single.elements)) <= _allowed_delta(
_text_size(resp_single.elements),
absolute=250,
ratio=0.2,
)

for page_number, single_page_elements in single_pages.items():
split_page_elements = split_pages[page_number]

assert abs(len(split_page_elements) - len(single_page_elements)) <= _allowed_delta(
len(single_page_elements),
absolute=2,
ratio=0.2,
)
assert abs(_text_size(split_page_elements) - _text_size(single_page_elements)) <= _allowed_delta(
_text_size(single_page_elements),
absolute=120,
ratio=0.3,
)


def _assert_split_unsplit_equivalent(resp_split, resp_single, strategy, extra_exclude_paths=None):
"""Compare split-PDF and single-request responses.

For hi_res (OCR-based), splitting changes per-page context so text and
OCR text can vary slightly. We still check page coverage, type distribution,
and text volume so split requests cannot silently drift too far.
For deterministic strategies (fast, etc.) we keep strict DeepDiff equality.
"""
assert resp_split.status_code == resp_single.status_code
assert resp_split.content_type == resp_single.content_type

if strategy in _HI_RES_STRATEGIES:
_assert_hi_res_output_is_similar(resp_split, resp_single)
else:
assert len(resp_split.elements) == len(resp_single.elements)

excludes = [r"root\[\d+\]\['metadata'\]\['parent_id'\]"]
if extra_exclude_paths:
excludes.extend(extra_exclude_paths)

diff = DeepDiff(
t1=resp_split.elements,
t2=resp_single.elements,
exclude_regex_paths=excludes,
)
assert len(diff) == 0


@pytest.mark.parametrize("concurrency_level", [1, 2, 5])
Expand Down Expand Up @@ -53,7 +144,7 @@ def test_integration_split_pdf_has_same_output_as_non_split(
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

with open(filename, "rb") as f:
files = shared.Files(
Expand Down Expand Up @@ -100,18 +191,7 @@ def test_integration_split_pdf_has_same_output_as_non_split(
request=req,
)

assert len(resp_split.elements) == len(resp_single.elements)
assert resp_split.content_type == resp_single.content_type
assert resp_split.status_code == resp_single.status_code

diff = DeepDiff(
t1=resp_split.elements,
t2=resp_single.elements,
exclude_regex_paths=[
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
],
)
assert len(diff) == 0
_assert_split_unsplit_equivalent(resp_split, resp_single, strategy)


@pytest.mark.parametrize(("filename", "expected_ok", "strategy"), [
Expand All @@ -136,7 +216,7 @@ def test_integration_split_pdf_with_caching(
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

with open(filename, "rb") as f:
files = shared.Files(
Expand Down Expand Up @@ -183,19 +263,7 @@ def test_integration_split_pdf_with_caching(
request=req
)

assert len(resp_split.elements) == len(resp_single.elements)
assert resp_split.content_type == resp_single.content_type
assert resp_split.status_code == resp_single.status_code

diff = DeepDiff(
t1=resp_split.elements,
t2=resp_single.elements,
exclude_regex_paths=[
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
r"root\[\d+\]\['element_id'\]",
],
)
assert len(diff) == 0
_assert_split_unsplit_equivalent(resp_split, resp_single, strategy)

# make sure the cache dir was cleaned if passed explicitly
if cache_dir:
Expand All @@ -212,7 +280,7 @@ def test_long_pages_hi_res(filename):
split_pdf_concurrency_level=15
), )

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

response = client.general.partition(
request=req,
Expand All @@ -231,7 +299,7 @@ def test_integration_split_pdf_for_file_with_no_name():
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

with open("_sample_docs/layout-parser-paper-fast.pdf", "rb") as f:
files = shared.Files(
Expand Down Expand Up @@ -287,7 +355,7 @@ def test_integration_split_pdf_with_page_range(
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

filename = "_sample_docs/layout-parser-paper.pdf"
with open(filename, "rb") as f:
Expand Down Expand Up @@ -351,7 +419,7 @@ def test_integration_split_pdf_strict_mode(
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(api_key_auth=FAKE_KEY)
client = UnstructuredClient(api_key_auth=FAKE_KEY, timeout_ms=TEST_TIMEOUT_MS)

with open(filename, "rb") as f:
files = shared.Files(
Expand Down Expand Up @@ -400,18 +468,7 @@ def test_integration_split_pdf_strict_mode(
server_url="http://localhost:8000",
)

assert len(resp_split.elements) == len(resp_single.elements)
assert resp_split.content_type == resp_single.content_type
assert resp_split.status_code == resp_single.status_code

diff = DeepDiff(
t1=resp_split.elements,
t2=resp_single.elements,
exclude_regex_paths=[
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
],
)
assert len(diff) == 0
_assert_split_unsplit_equivalent(resp_split, resp_single, strategy)


@pytest.mark.asyncio
Expand Down
Loading
Loading