From 3393e435f9d9c69c31fd77394b6a634795c53520 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Thu, 26 Feb 2026 23:55:47 +0530
Subject: [PATCH 01/12] feat: implement task dispatcher for document processing
- Introduced a TaskDispatcher abstraction that decouples the upload endpoint from Celery, so tests can swap in a synchronous implementation (see the sketch below).
- Updated create_documents_file_upload to dispatch processing tasks through the new dispatcher.
- Removed the direct Celery task import from the upload function, improving modularity.
- Added integration tests for document upload, covering page-limit enforcement and file-size restrictions.
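For illustration, a minimal sketch of how a test can swap in its own dispatcher through FastAPI's dependency_overrides (the same mechanism the integration conftest in this patch uses); RecordingDispatcher and the test name are hypothetical, not part of this patch:

    from app.app import app
    from app.services.task_dispatcher import get_task_dispatcher

    class RecordingDispatcher:
        """Records dispatch calls instead of running any processing."""

        def __init__(self) -> None:
            self.calls: list[dict] = []

        async def dispatch_file_processing(self, **kwargs) -> None:
            # Satisfies the TaskDispatcher protocol; just captures the args.
            self.calls.append(kwargs)

    def test_upload_dispatches_one_task_per_file():
        dispatcher = RecordingDispatcher()
        app.dependency_overrides[get_task_dispatcher] = lambda: dispatcher
        # ... upload files via the test client, then assert on
        # dispatcher.calls (one entry per file, keyword args only).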
---
.../app/routes/documents_routes.py | 10 +-
.../app/services/task_dispatcher.py | 50 ++++
surfsense_backend/tests/e2e/conftest.py | 198 ------------
.../document_upload}/__init__.py | 0
.../integration/document_upload/conftest.py | 282 ++++++++++++++++++
.../document_upload}/test_document_upload.py | 70 +----
.../document_upload}/test_page_limits.py | 37 ++-
.../document_upload}/test_upload_limits.py | 7 +-
surfsense_backend/tests/utils/helpers.py | 6 +-
9 files changed, 380 insertions(+), 280 deletions(-)
create mode 100644 surfsense_backend/app/services/task_dispatcher.py
delete mode 100644 surfsense_backend/tests/e2e/conftest.py
rename surfsense_backend/tests/{e2e => integration/document_upload}/__init__.py (100%)
create mode 100644 surfsense_backend/tests/integration/document_upload/conftest.py
rename surfsense_backend/tests/{e2e => integration/document_upload}/test_document_upload.py (90%)
rename surfsense_backend/tests/{e2e => integration/document_upload}/test_page_limits.py (91%)
rename surfsense_backend/tests/{e2e => integration/document_upload}/test_upload_limits.py (96%)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 8217b6dac..4b04fd247 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -28,6 +28,7 @@
DocumentWithChunksRead,
PaginatedResponse,
)
+from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
from app.users import current_active_user
from app.utils.rbac import check_permission
@@ -120,6 +121,7 @@ async def create_documents_file_upload(
search_space_id: int = Form(...),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
+ dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
):
"""
Upload files as documents with real-time status tracking.
@@ -290,14 +292,10 @@ async def create_documents_file_upload(
for doc in created_documents:
await session.refresh(doc)
- # ===== PHASE 2: Dispatch Celery tasks for each file =====
+ # ===== PHASE 2: Dispatch tasks for each file =====
# Each task will update document status: pending → processing → ready/failed
- from app.tasks.celery_tasks.document_tasks import (
- process_file_upload_with_document_task,
- )
-
for document, temp_path, filename in files_to_process:
- process_file_upload_with_document_task.delay(
+ await dispatcher.dispatch_file_processing(
document_id=document.id,
temp_path=temp_path,
filename=filename,
diff --git a/surfsense_backend/app/services/task_dispatcher.py b/surfsense_backend/app/services/task_dispatcher.py
new file mode 100644
index 000000000..e45d601a6
--- /dev/null
+++ b/surfsense_backend/app/services/task_dispatcher.py
@@ -0,0 +1,50 @@
+"""Task dispatcher abstraction for background document processing.
+
+Decouples the upload endpoint from Celery so tests can swap in a
+synchronous (inline) implementation that needs only PostgreSQL.
+"""
+
+from __future__ import annotations
+
+from typing import Protocol
+
+
+class TaskDispatcher(Protocol):
+ async def dispatch_file_processing(
+ self,
+ *,
+ document_id: int,
+ temp_path: str,
+ filename: str,
+ search_space_id: int,
+ user_id: str,
+ ) -> None: ...
+
+
+class CeleryTaskDispatcher:
+ """Production dispatcher — fires Celery tasks via Redis broker."""
+
+ async def dispatch_file_processing(
+ self,
+ *,
+ document_id: int,
+ temp_path: str,
+ filename: str,
+ search_space_id: int,
+ user_id: str,
+ ) -> None:
+ from app.tasks.celery_tasks.document_tasks import (
+ process_file_upload_with_document_task,
+ )
+
+ process_file_upload_with_document_task.delay(
+ document_id=document_id,
+ temp_path=temp_path,
+ filename=filename,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ )
+
+
+async def get_task_dispatcher() -> TaskDispatcher:
+ return CeleryTaskDispatcher()
diff --git a/surfsense_backend/tests/e2e/conftest.py b/surfsense_backend/tests/e2e/conftest.py
deleted file mode 100644
index 4c418612c..000000000
--- a/surfsense_backend/tests/e2e/conftest.py
+++ /dev/null
@@ -1,198 +0,0 @@
-"""E2e conftest — fixtures that require a running backend + database."""
-
-from __future__ import annotations
-
-from collections.abc import AsyncGenerator
-
-import asyncpg
-import httpx
-import pytest
-
-from tests.conftest import DATABASE_URL
-from tests.utils.helpers import (
- BACKEND_URL,
- TEST_EMAIL,
- auth_headers,
- delete_document,
- get_auth_token,
- get_search_space_id,
-)
-
-# ---------------------------------------------------------------------------
-# Backend connectivity fixtures
-# ---------------------------------------------------------------------------
-
-
-@pytest.fixture(scope="session")
-def backend_url() -> str:
- return BACKEND_URL
-
-
-@pytest.fixture(scope="session")
-async def auth_token(backend_url: str) -> str:
- """Authenticate once per session, registering the user if needed."""
- async with httpx.AsyncClient(base_url=backend_url, timeout=30.0) as client:
- return await get_auth_token(client)
-
-
-@pytest.fixture(scope="session")
-async def search_space_id(backend_url: str, auth_token: str) -> int:
- """Discover the first search space belonging to the test user."""
- async with httpx.AsyncClient(base_url=backend_url, timeout=30.0) as client:
- return await get_search_space_id(client, auth_token)
-
-
-@pytest.fixture(scope="session", autouse=True)
-async def _purge_test_search_space(
- search_space_id: int,
-):
- """
- Delete all documents in the test search space before the session starts.
-
- Uses direct database access to bypass the API's 409 protection on
- pending/processing documents. This ensures stuck documents from
- previous crashed runs are always cleaned up.
- """
- deleted = await _force_delete_documents_db(search_space_id)
- if deleted:
- print(
- f"\n[purge] Deleted {deleted} stale document(s) from search space {search_space_id}"
- )
- yield
-
-
-@pytest.fixture(scope="session")
-def headers(auth_token: str) -> dict[str, str]:
- """Authorization headers reused across all tests in the session."""
- return auth_headers(auth_token)
-
-
-@pytest.fixture
-async def client(backend_url: str) -> AsyncGenerator[httpx.AsyncClient]:
- """Per-test async HTTP client pointing at the running backend."""
- async with httpx.AsyncClient(base_url=backend_url, timeout=180.0) as c:
- yield c
-
-
-@pytest.fixture
-def cleanup_doc_ids() -> list[int]:
- """Accumulator for document IDs that should be deleted after the test."""
- return []
-
-
-@pytest.fixture(autouse=True)
-async def _cleanup_documents(
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
-):
- """
- Runs after every test. Tries the API first for clean deletes, then
- falls back to direct DB access for any stuck documents.
- """
- yield
-
- remaining_ids: list[int] = []
- for doc_id in cleanup_doc_ids:
- try:
- resp = await delete_document(client, headers, doc_id)
- if resp.status_code == 409:
- remaining_ids.append(doc_id)
- except Exception:
- remaining_ids.append(doc_id)
-
- if remaining_ids:
- conn = await asyncpg.connect(DATABASE_URL)
- try:
- await conn.execute(
- "DELETE FROM documents WHERE id = ANY($1::int[])",
- remaining_ids,
- )
- finally:
- await conn.close()
-
-
-# ---------------------------------------------------------------------------
-# Page-limit helpers (direct DB access)
-# ---------------------------------------------------------------------------
-
-
-async def _force_delete_documents_db(search_space_id: int) -> int:
- """
- Bypass the API and delete documents directly from the database.
-
- This handles stuck documents in pending/processing state that the API
- refuses to delete (409 Conflict). Chunks are cascade-deleted by the
- foreign key constraint.
-
- Returns the number of deleted rows.
- """
- conn = await asyncpg.connect(DATABASE_URL)
- try:
- result = await conn.execute(
- "DELETE FROM documents WHERE search_space_id = $1",
- search_space_id,
- )
- return int(result.split()[-1])
- finally:
- await conn.close()
-
-
-async def _get_user_page_usage(email: str) -> tuple[int, int]:
- """Return ``(pages_used, pages_limit)`` for the given user."""
- conn = await asyncpg.connect(DATABASE_URL)
- try:
- row = await conn.fetchrow(
- 'SELECT pages_used, pages_limit FROM "user" WHERE email = $1',
- email,
- )
- assert row is not None, f"User {email!r} not found in database"
- return row["pages_used"], row["pages_limit"]
- finally:
- await conn.close()
-
-
-async def _set_user_page_limits(
- email: str, *, pages_used: int, pages_limit: int
-) -> None:
- """Overwrite ``pages_used`` and ``pages_limit`` for the given user."""
- conn = await asyncpg.connect(DATABASE_URL)
- try:
- await conn.execute(
- 'UPDATE "user" SET pages_used = $1, pages_limit = $2 WHERE email = $3',
- pages_used,
- pages_limit,
- email,
- )
- finally:
- await conn.close()
-
-
-@pytest.fixture
-async def page_limits():
- """
- Fixture that exposes helpers for manipulating the test user's page limits.
-
- Automatically restores the original values after each test.
-
- Usage inside a test::
-
- await page_limits.set(pages_used=0, pages_limit=100)
- used, limit = await page_limits.get()
- """
-
- class _PageLimits:
- async def set(self, *, pages_used: int, pages_limit: int) -> None:
- await _set_user_page_limits(
- TEST_EMAIL, pages_used=pages_used, pages_limit=pages_limit
- )
-
- async def get(self) -> tuple[int, int]:
- return await _get_user_page_usage(TEST_EMAIL)
-
- original = await _get_user_page_usage(TEST_EMAIL)
- yield _PageLimits()
- await _set_user_page_limits(
- TEST_EMAIL, pages_used=original[0], pages_limit=original[1]
- )
diff --git a/surfsense_backend/tests/e2e/__init__.py b/surfsense_backend/tests/integration/document_upload/__init__.py
similarity index 100%
rename from surfsense_backend/tests/e2e/__init__.py
rename to surfsense_backend/tests/integration/document_upload/__init__.py
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
new file mode 100644
index 000000000..325163b44
--- /dev/null
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -0,0 +1,282 @@
+"""Integration conftest — runs the FastAPI app in-process via ASGITransport.
+
+Prerequisites: PostgreSQL + pgvector only.
+
+External system boundaries are mocked:
+ - LLM summarization, text embedding, text chunking (external APIs)
+ - Redis heartbeat (external infrastructure)
+ - Task dispatch is swapped via DI (InlineTaskDispatcher)
+"""
+
+from __future__ import annotations
+
+import contextlib
+from collections.abc import AsyncGenerator
+from unittest.mock import AsyncMock, MagicMock
+
+import asyncpg
+import httpx
+import pytest
+from httpx import ASGITransport
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import create_async_engine
+from sqlalchemy.pool import NullPool
+
+from app.app import app
+from app.config import config as app_config
+from app.db import DATABASE_URL as APP_DB_URL, Base
+from app.services.task_dispatcher import get_task_dispatcher
+from tests.conftest import DATABASE_URL
+from tests.utils.helpers import (
+ TEST_EMAIL,
+ auth_headers,
+ delete_document,
+ get_auth_token,
+ get_search_space_id,
+)
+
+_EMBEDDING_DIM = app_config.embedding_model_instance.dimension
+
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Inline task dispatcher (replaces Celery via DI — not a mock)
+# ---------------------------------------------------------------------------
+
+
+class InlineTaskDispatcher:
+ """Processes files synchronously in the calling coroutine.
+
+ Swapped in via FastAPI dependency_overrides so the upload endpoint
+ processes documents inline instead of dispatching to Celery.
+
+ Exceptions are caught to match Celery's fire-and-forget semantics —
+ the processing function already marks documents as failed internally.
+ """
+
+ async def dispatch_file_processing(
+ self,
+ *,
+ document_id: int,
+ temp_path: str,
+ filename: str,
+ search_space_id: int,
+ user_id: str,
+ ) -> None:
+ from app.tasks.celery_tasks.document_tasks import (
+ _process_file_with_document,
+ )
+
+ with contextlib.suppress(Exception):
+ await _process_file_with_document(
+ document_id, temp_path, filename, search_space_id, user_id
+ )
+
+
+app.dependency_overrides[get_task_dispatcher] = lambda: InlineTaskDispatcher()
+
+
+# ---------------------------------------------------------------------------
+# Database setup (ASGITransport skips the app lifespan)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+async def _ensure_tables():
+ """Create DB tables and extensions once per session."""
+ engine = create_async_engine(APP_DB_URL, poolclass=NullPool)
+ async with engine.begin() as conn:
+ await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+ await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
+ await conn.run_sync(Base.metadata.create_all)
+ await engine.dispose()
+
+
+# ---------------------------------------------------------------------------
+# Auth & search space (session-scoped, via the in-process app)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+async def auth_token(_ensure_tables) -> str:
+ """Authenticate once per session, registering the user if needed."""
+ async with httpx.AsyncClient(
+ transport=ASGITransport(app=app), base_url="http://test", timeout=30.0
+ ) as c:
+ return await get_auth_token(c)
+
+
+@pytest.fixture(scope="session")
+async def search_space_id(auth_token: str) -> int:
+ """Discover the first search space belonging to the test user."""
+ async with httpx.AsyncClient(
+ transport=ASGITransport(app=app), base_url="http://test", timeout=30.0
+ ) as c:
+ return await get_search_space_id(c, auth_token)
+
+
+@pytest.fixture(scope="session")
+def headers(auth_token: str) -> dict[str, str]:
+ return auth_headers(auth_token)
+
+
+# ---------------------------------------------------------------------------
+# Per-test HTTP client & cleanup
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+async def client() -> AsyncGenerator[httpx.AsyncClient]:
+ """Per-test async HTTP client using ASGITransport (no running server)."""
+ async with httpx.AsyncClient(
+ transport=ASGITransport(app=app), base_url="http://test", timeout=180.0
+ ) as c:
+ yield c
+
+
+@pytest.fixture
+def cleanup_doc_ids() -> list[int]:
+ """Accumulator for document IDs that should be deleted after the test."""
+ return []
+
+
+@pytest.fixture(scope="session", autouse=True)
+async def _purge_test_search_space(search_space_id: int):
+ """Delete stale documents from previous runs before the session starts."""
+ conn = await asyncpg.connect(DATABASE_URL)
+ try:
+ result = await conn.execute(
+ "DELETE FROM documents WHERE search_space_id = $1",
+ search_space_id,
+ )
+ deleted = int(result.split()[-1])
+ if deleted:
+ print(
+ f"\n[purge] Deleted {deleted} stale document(s) "
+ f"from search space {search_space_id}"
+ )
+ finally:
+ await conn.close()
+ yield
+
+
+@pytest.fixture(autouse=True)
+async def _cleanup_documents(
+ client: httpx.AsyncClient,
+ headers: dict[str, str],
+ cleanup_doc_ids: list[int],
+):
+ """Delete test documents after every test (API first, DB fallback)."""
+ yield
+
+ remaining_ids: list[int] = []
+ for doc_id in cleanup_doc_ids:
+ try:
+ resp = await delete_document(client, headers, doc_id)
+ if resp.status_code == 409:
+ remaining_ids.append(doc_id)
+ except Exception:
+ remaining_ids.append(doc_id)
+
+ if remaining_ids:
+ conn = await asyncpg.connect(DATABASE_URL)
+ try:
+ await conn.execute(
+ "DELETE FROM documents WHERE id = ANY($1::int[])",
+ remaining_ids,
+ )
+ finally:
+ await conn.close()
+
+
+# ---------------------------------------------------------------------------
+# Page-limit helpers (direct DB for setup, API for verification)
+# ---------------------------------------------------------------------------
+
+
+async def _get_user_page_usage(email: str) -> tuple[int, int]:
+ conn = await asyncpg.connect(DATABASE_URL)
+ try:
+ row = await conn.fetchrow(
+ 'SELECT pages_used, pages_limit FROM "user" WHERE email = $1',
+ email,
+ )
+ assert row is not None, f"User {email!r} not found in database"
+ return row["pages_used"], row["pages_limit"]
+ finally:
+ await conn.close()
+
+
+async def _set_user_page_limits(
+ email: str, *, pages_used: int, pages_limit: int
+) -> None:
+ conn = await asyncpg.connect(DATABASE_URL)
+ try:
+ await conn.execute(
+ 'UPDATE "user" SET pages_used = $1, pages_limit = $2 WHERE email = $3',
+ pages_used,
+ pages_limit,
+ email,
+ )
+ finally:
+ await conn.close()
+
+
+@pytest.fixture
+async def page_limits():
+ """Manipulate the test user's page limits (direct DB for setup only).
+
+ Automatically restores original values after each test.
+ """
+
+ class _PageLimits:
+ async def set(self, *, pages_used: int, pages_limit: int) -> None:
+ await _set_user_page_limits(
+ TEST_EMAIL, pages_used=pages_used, pages_limit=pages_limit
+ )
+
+ original = await _get_user_page_usage(TEST_EMAIL)
+ yield _PageLimits()
+ await _set_user_page_limits(
+ TEST_EMAIL, pages_used=original[0], pages_limit=original[1]
+ )
+
+
+# ---------------------------------------------------------------------------
+# Mock external system boundaries
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _mock_external_apis(monkeypatch):
+ """Mock LLM, embedding, and chunking — these are external API boundaries."""
+ monkeypatch.setattr(
+ "app.indexing_pipeline.indexing_pipeline_service.summarize_document",
+ AsyncMock(return_value="Mocked summary."),
+ )
+ monkeypatch.setattr(
+ "app.indexing_pipeline.indexing_pipeline_service.embed_text",
+ MagicMock(return_value=[0.1] * _EMBEDDING_DIM),
+ )
+ monkeypatch.setattr(
+ "app.indexing_pipeline.indexing_pipeline_service.chunk_text",
+ MagicMock(return_value=["Test chunk content."]),
+ )
+
+
+@pytest.fixture(autouse=True)
+def _mock_redis_heartbeat(monkeypatch):
+ """Mock Redis heartbeat — Redis is an external infrastructure boundary."""
+ monkeypatch.setattr(
+ "app.tasks.celery_tasks.document_tasks._start_heartbeat",
+ lambda notification_id: None,
+ )
+ monkeypatch.setattr(
+ "app.tasks.celery_tasks.document_tasks._stop_heartbeat",
+ lambda notification_id: None,
+ )
+ monkeypatch.setattr(
+ "app.tasks.celery_tasks.document_tasks._run_heartbeat_loop",
+ AsyncMock(),
+ )
diff --git a/surfsense_backend/tests/e2e/test_document_upload.py b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
similarity index 90%
rename from surfsense_backend/tests/e2e/test_document_upload.py
rename to surfsense_backend/tests/integration/document_upload/test_document_upload.py
index a47b93d52..1a9433a8f 100644
--- a/surfsense_backend/tests/e2e/test_document_upload.py
+++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
@@ -1,14 +1,14 @@
"""
-End-to-end tests for manual document upload.
+Integration tests for manual document upload.
-These tests exercise the full pipeline:
- API upload → Celery task → ETL extraction → chunking → embedding → DB storage
+These tests exercise the full pipeline via the HTTP API:
+ API upload → inline task dispatch → ETL extraction → chunking → embedding → DB storage
-Prerequisites (must be running):
- - FastAPI backend
+External boundaries mocked: LLM summarization, text embedding, text chunking,
+Redis heartbeat. Task dispatch is swapped via DI (InlineTaskDispatcher).
+
+Prerequisites:
- PostgreSQL + pgvector
- - Redis
- - Celery worker
"""
from __future__ import annotations
@@ -28,7 +28,7 @@
upload_multiple_files,
)
-pytestmark = pytest.mark.e2e
+pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
# Helpers local to this module
@@ -45,7 +45,7 @@ def _assert_document_ready(doc: dict, *, expected_filename: str) -> None:
# ---------------------------------------------------------------------------
-# Test A: Upload a .txt file (direct read path — no ETL service needed)
+# Test A: Upload a .txt file (direct read path)
# ---------------------------------------------------------------------------
@@ -108,7 +108,6 @@ async def test_txt_document_fields_populated(
doc = await get_document(client, headers, doc_ids[0])
_assert_document_ready(doc, expected_filename="sample.txt")
- assert doc["document_metadata"]["ETL_SERVICE"] == "MARKDOWN"
# ---------------------------------------------------------------------------
@@ -158,11 +157,10 @@ async def test_md_document_fields_populated(
doc = await get_document(client, headers, doc_ids[0])
_assert_document_ready(doc, expected_filename="sample.md")
- assert doc["document_metadata"]["ETL_SERVICE"] == "MARKDOWN"
# ---------------------------------------------------------------------------
-# Test C: Upload a .pdf file (ETL path — Docling / Unstructured)
+# Test C: Upload a .pdf file (ETL path)
# ---------------------------------------------------------------------------
@@ -208,11 +206,6 @@ async def test_pdf_document_fields_populated(
doc = await get_document(client, headers, doc_ids[0])
_assert_document_ready(doc, expected_filename="sample.pdf")
- assert doc["document_metadata"]["ETL_SERVICE"] in {
- "DOCLING",
- "UNSTRUCTURED",
- "LLAMACLOUD",
- }
# ---------------------------------------------------------------------------
@@ -284,7 +277,6 @@ async def test_duplicate_file_is_skipped(
search_space_id: int,
cleanup_doc_ids: list[int],
):
- # First upload
resp1 = await upload_file(
client, headers, "sample.txt", search_space_id=search_space_id
)
@@ -296,7 +288,6 @@ async def test_duplicate_file_is_skipped(
client, headers, first_ids, search_space_id=search_space_id
)
- # Second upload of the same file
resp2 = await upload_file(
client, headers, "sample.txt", search_space_id=search_space_id
)
@@ -327,7 +318,6 @@ async def test_same_content_different_name_detected(
cleanup_doc_ids: list[int],
tmp_path: Path,
):
- # First upload
resp1 = await upload_file(
client, headers, "sample.txt", search_space_id=search_space_id
)
@@ -338,7 +328,6 @@ async def test_same_content_different_name_detected(
client, headers, first_ids, search_space_id=search_space_id
)
- # Copy fixture content to a differently named temp file
src = FIXTURES_DIR / "sample.txt"
dest = tmp_path / "renamed_sample.txt"
shutil.copy2(src, dest)
@@ -477,39 +466,7 @@ async def test_delete_processed_document(
# ---------------------------------------------------------------------------
-# Test K: Cannot delete a document while it is still processing
-# ---------------------------------------------------------------------------
-
-
-class TestDeleteWhileProcessing:
- """Attempting to delete a pending/processing document should be rejected."""
-
- async def test_delete_pending_document_returns_409(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.pdf", search_space_id=search_space_id
- )
- assert resp.status_code == 200
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- # Immediately try to delete before processing finishes
- del_resp = await delete_document(client, headers, doc_ids[0])
- assert del_resp.status_code == 409
-
- # Let it finish so cleanup can work
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id, timeout=300.0
- )
-
-
-# ---------------------------------------------------------------------------
-# Test L: Status polling returns correct structure
+# Test K: Searchability after upload
# ---------------------------------------------------------------------------
@@ -547,6 +504,11 @@ async def test_uploaded_document_appears_in_search(
)
+# ---------------------------------------------------------------------------
+# Test L: Status polling returns correct structure
+# ---------------------------------------------------------------------------
+
+
class TestStatusPolling:
"""Verify the status endpoint returns well-formed responses."""
diff --git a/surfsense_backend/tests/e2e/test_page_limits.py b/surfsense_backend/tests/integration/document_upload/test_page_limits.py
similarity index 91%
rename from surfsense_backend/tests/e2e/test_page_limits.py
rename to surfsense_backend/tests/integration/document_upload/test_page_limits.py
index 092772ceb..985fd7128 100644
--- a/surfsense_backend/tests/e2e/test_page_limits.py
+++ b/surfsense_backend/tests/integration/document_upload/test_page_limits.py
@@ -1,23 +1,20 @@
"""
-End-to-end tests for page-limit enforcement during document upload.
+Integration tests for page-limit enforcement during document upload.
These tests manipulate the test user's ``pages_used`` / ``pages_limit``
-columns directly in the database and then exercise the upload pipeline to
-verify that:
+columns directly in the database (setup only) and then exercise the upload
+pipeline to verify that:
- Uploads are rejected *before* ETL when the limit is exhausted.
- - ``pages_used`` increases after a successful upload.
+ - ``pages_used`` increases after a successful upload (verified via API).
- A ``page_limit_exceeded`` notification is created on rejection.
- ``pages_used`` is not modified when a document fails processing.
All tests reuse the existing small fixtures (``sample.pdf``, ``sample.txt``)
so no additional processing time is introduced.
-Prerequisites (must be running):
- - FastAPI backend
+Prerequisites:
- PostgreSQL + pgvector
- - Redis
- - Celery worker
"""
from __future__ import annotations
@@ -31,7 +28,21 @@
upload_file,
)
-pytestmark = pytest.mark.e2e
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Helper: read pages_used through the public API
+# ---------------------------------------------------------------------------
+
+
+async def _get_pages_used(client: httpx.AsyncClient, headers: dict[str, str]) -> int:
+ """Fetch the current user's pages_used via the /users/me API."""
+ resp = await client.get("/users/me", headers=headers)
+ assert resp.status_code == 200, (
+ f"GET /users/me failed ({resp.status_code}): {resp.text}"
+ )
+ return resp.json()["pages_used"]
# ---------------------------------------------------------------------------
@@ -65,7 +76,7 @@ async def test_pages_used_increases_after_pdf_upload(
for did in doc_ids:
assert statuses[did]["status"]["state"] == "ready"
- used, _ = await page_limits.get()
+ used = await _get_pages_used(client, headers)
assert used > 0, "pages_used should have increased after successful processing"
@@ -128,7 +139,7 @@ async def test_pages_used_unchanged_after_limit_rejection(
client, headers, doc_ids, search_space_id=search_space_id, timeout=300.0
)
- used, _ = await page_limits.get()
+ used = await _get_pages_used(client, headers)
assert used == 50, (
f"pages_used should remain 50 after rejected upload, got {used}"
)
@@ -263,7 +274,7 @@ async def test_pages_used_stable_on_etl_failure(
for did in doc_ids:
assert statuses[did]["status"]["state"] == "failed"
- used, _ = await page_limits.get()
+ used = await _get_pages_used(client, headers)
assert used == 10, f"pages_used should remain 10 after ETL failure, got {used}"
@@ -284,7 +295,6 @@ async def test_second_upload_rejected_after_quota_consumed(
cleanup_doc_ids: list[int],
page_limits,
):
- # Give just enough room for one ~1-page PDF
await page_limits.set(pages_used=0, pages_limit=1)
resp1 = await upload_file(
@@ -300,7 +310,6 @@ async def test_second_upload_rejected_after_quota_consumed(
for did in first_ids:
assert statuses1[did]["status"]["state"] == "ready"
- # Second upload — should fail because quota is now consumed
resp2 = await upload_file(
client,
headers,
diff --git a/surfsense_backend/tests/e2e/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
similarity index 96%
rename from surfsense_backend/tests/e2e/test_upload_limits.py
rename to surfsense_backend/tests/integration/document_upload/test_upload_limits.py
index 9368b0f43..a8dab43f0 100644
--- a/surfsense_backend/tests/e2e/test_upload_limits.py
+++ b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
@@ -1,5 +1,5 @@
"""
-End-to-end tests for backend file upload limit enforcement.
+Integration tests for backend file upload limit enforcement.
These tests verify that the API rejects uploads that exceed:
- Max files per upload (10)
@@ -9,8 +9,7 @@
The limits mirror the frontend's DocumentUploadTab.tsx constants and are
enforced server-side to protect against direct API calls.
-Prerequisites (must be running):
- - FastAPI backend
+Prerequisites:
- PostgreSQL + pgvector
"""
@@ -21,7 +20,7 @@
import httpx
import pytest
-pytestmark = pytest.mark.e2e
+pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
diff --git a/surfsense_backend/tests/utils/helpers.py b/surfsense_backend/tests/utils/helpers.py
index 270db7125..c5719a253 100644
--- a/surfsense_backend/tests/utils/helpers.py
+++ b/surfsense_backend/tests/utils/helpers.py
@@ -3,16 +3,14 @@
from __future__ import annotations
import asyncio
-import os
from pathlib import Path
import httpx
FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
-BACKEND_URL = os.environ.get("TEST_BACKEND_URL", "http://localhost:8000")
-TEST_EMAIL = os.environ.get("TEST_USER_EMAIL", "testuser@surfsense.com")
-TEST_PASSWORD = os.environ.get("TEST_USER_PASSWORD", "testpassword123")
+TEST_EMAIL = "testuser@surfsense.com"
+TEST_PASSWORD = "testpassword123"
async def get_auth_token(client: httpx.AsyncClient) -> str:
From 87711ee381cbe5ed21e7e977503232cd96ce3588 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Thu, 26 Feb 2026 23:56:01 +0530
Subject: [PATCH 02/12] chore: clean up .env.example and pyproject.toml
- Removed commented-out testing configuration from .env.example to streamline the file.
- Updated markers in pyproject.toml to remove the e2e test marker, clarifying the purpose of the remaining markers.
---
surfsense_backend/.env.example | 9 ---------
surfsense_backend/pyproject.toml | 3 +--
2 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example
index 1aff5f3d9..0af368081 100644
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@@ -176,12 +176,3 @@ DAYTONA_API_URL=https://app.daytona.io/api
DAYTONA_TARGET=us
# Directory for locally-persisted sandbox files (after sandbox deletion)
SANDBOX_FILES_DIR=sandbox_files
-
-
-# ============================================================
-# Testing (optional — all have sensible defaults)
-# ============================================================
-# TEST_BACKEND_URL=http://localhost:8000
-# TEST_DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
-# TEST_USER_EMAIL=testuser@surfsense.com
-# TEST_USER_PASSWORD=testpassword123
diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml
index 36dc4e7c1..08775b31f 100644
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -178,8 +178,7 @@ python_functions = ["test_*"]
addopts = "-v --tb=short -x --strict-markers -ra --durations=5"
markers = [
"unit: pure logic tests, no DB or external services",
- "integration: tests that require a real PostgreSQL database",
- "e2e: tests requiring a running backend and real HTTP calls"
+ "integration: tests that require a real PostgreSQL database"
]
filterwarnings = [
"ignore::UserWarning:chonkie",
From 223c2de0d2d166bcffa672d3b536c09f7db0df37 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 00:05:21 +0530
Subject: [PATCH 03/12] refactor: update database connection handling in test
configurations
---
surfsense_backend/tests/conftest.py | 16 +++++-----------
.../integration/document_upload/conftest.py | 15 ++++++++-------
2 files changed, 13 insertions(+), 18 deletions(-)
diff --git a/surfsense_backend/tests/conftest.py b/surfsense_backend/tests/conftest.py
index 2c8e81fa3..19349affb 100644
--- a/surfsense_backend/tests/conftest.py
+++ b/surfsense_backend/tests/conftest.py
@@ -3,23 +3,17 @@
from __future__ import annotations
import os
-from pathlib import Path
+
+os.environ.setdefault(
+ "DATABASE_URL",
+ "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test",
+)
import pytest
-from dotenv import load_dotenv
from app.db import DocumentType
from app.indexing_pipeline.connector_document import ConnectorDocument
-load_dotenv(Path(__file__).resolve().parent.parent / ".env")
-
-# Shared DB URL referenced by both e2e and integration helper functions.
-DATABASE_URL = os.environ.get(
- "TEST_DATABASE_URL",
- os.environ.get("DATABASE_URL", ""),
-).replace("postgresql+asyncpg://", "postgresql://")
-
-
# ---------------------------------------------------------------------------
# Unit test fixtures
# ---------------------------------------------------------------------------
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index 325163b44..78dcf1db9 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -24,9 +24,9 @@
from app.app import app
from app.config import config as app_config
-from app.db import DATABASE_URL as APP_DB_URL, Base
+from app.db import Base
from app.services.task_dispatcher import get_task_dispatcher
-from tests.conftest import DATABASE_URL
+from tests.integration.conftest import TEST_DATABASE_URL
from tests.utils.helpers import (
TEST_EMAIL,
auth_headers,
@@ -36,6 +36,7 @@
)
_EMBEDDING_DIM = app_config.embedding_model_instance.dimension
+_ASYNCPG_URL = TEST_DATABASE_URL.replace("postgresql+asyncpg://", "postgresql://")
pytestmark = pytest.mark.integration
@@ -85,7 +86,7 @@ async def dispatch_file_processing(
@pytest.fixture(scope="session")
async def _ensure_tables():
"""Create DB tables and extensions once per session."""
- engine = create_async_engine(APP_DB_URL, poolclass=NullPool)
+ engine = create_async_engine(TEST_DATABASE_URL, poolclass=NullPool)
async with engine.begin() as conn:
await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
@@ -144,7 +145,7 @@ def cleanup_doc_ids() -> list[int]:
@pytest.fixture(scope="session", autouse=True)
async def _purge_test_search_space(search_space_id: int):
"""Delete stale documents from previous runs before the session starts."""
- conn = await asyncpg.connect(DATABASE_URL)
+ conn = await asyncpg.connect(_ASYNCPG_URL)
try:
result = await conn.execute(
"DELETE FROM documents WHERE search_space_id = $1",
@@ -180,7 +181,7 @@ async def _cleanup_documents(
remaining_ids.append(doc_id)
if remaining_ids:
- conn = await asyncpg.connect(DATABASE_URL)
+ conn = await asyncpg.connect(_ASYNCPG_URL)
try:
await conn.execute(
"DELETE FROM documents WHERE id = ANY($1::int[])",
@@ -196,7 +197,7 @@ async def _cleanup_documents(
async def _get_user_page_usage(email: str) -> tuple[int, int]:
- conn = await asyncpg.connect(DATABASE_URL)
+ conn = await asyncpg.connect(_ASYNCPG_URL)
try:
row = await conn.fetchrow(
'SELECT pages_used, pages_limit FROM "user" WHERE email = $1',
@@ -211,7 +212,7 @@ async def _get_user_page_usage(email: str) -> tuple[int, int]:
async def _set_user_page_limits(
email: str, *, pages_used: int, pages_limit: int
) -> None:
- conn = await asyncpg.connect(DATABASE_URL)
+ conn = await asyncpg.connect(_ASYNCPG_URL)
try:
await conn.execute(
'UPDATE "user" SET pages_used = $1, pages_limit = $2 WHERE email = $3',
From f09b5b0ea4d4f11f5255fd923db60951cc809538 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 00:17:39 +0530
Subject: [PATCH 04/12] refactor: replace hardcoded embedding dimension with
dynamic configuration
- Updated the embedding dimension in test configurations to read from the application config instead of a hardcoded value, keeping tests consistent with whichever embedding model is configured (see the sketch below).
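A minimal sketch of the pattern (illustrative only; the real changes are in the diff below). Reading the dimension from config keeps fixtures and assertions valid when a different embedding model is configured:

    from app.config import config as app_config

    _EMBEDDING_DIM = app_config.embedding_model_instance.dimension

    # Build a fake embedding of the configured size instead of
    # hardcoding 1024:
    fake_embedding = [0.1] * _EMBEDDING_DIM
    assert len(fake_embedding) == _EMBEDDING_DIM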
---
surfsense_backend/tests/integration/conftest.py | 3 ++-
.../integration/indexing_pipeline/test_index_document.py | 5 ++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py
index 119045d29..9dff257df 100644
--- a/surfsense_backend/tests/integration/conftest.py
+++ b/surfsense_backend/tests/integration/conftest.py
@@ -8,6 +8,7 @@
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.pool import NullPool
+from app.config import config as app_config
from app.db import (
Base,
DocumentType,
@@ -18,7 +19,7 @@
)
from app.indexing_pipeline.connector_document import ConnectorDocument
-_EMBEDDING_DIM = 1024 # must match the Vector() dimension used in DB column creation
+_EMBEDDING_DIM = app_config.embedding_model_instance.dimension
_DEFAULT_TEST_DB = (
"postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
index 0065a03e1..2e8ee4d92 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py
@@ -1,9 +1,12 @@
import pytest
from sqlalchemy import select
+from app.config import config as app_config
from app.db import Chunk, Document, DocumentStatus
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
+_EMBEDDING_DIM = app_config.embedding_model_instance.dimension
+
pytestmark = pytest.mark.integration
@@ -144,7 +147,7 @@ async def test_embedding_written_to_db(
reloaded = result.scalars().first()
assert reloaded.embedding is not None
- assert len(reloaded.embedding) == 1024
+ assert len(reloaded.embedding) == _EMBEDDING_DIM
@pytest.mark.usefixtures(
From 1068ea25a77aea950b8eafa90e388ba8b5fa242c Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 00:45:51 +0530
Subject: [PATCH 05/12] refactor: standardize test database configuration
across test files
---
surfsense_backend/tests/conftest.py | 10 +++++++---
surfsense_backend/tests/integration/conftest.py | 7 +------
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/surfsense_backend/tests/conftest.py b/surfsense_backend/tests/conftest.py
index 19349affb..e1f0bc178 100644
--- a/surfsense_backend/tests/conftest.py
+++ b/surfsense_backend/tests/conftest.py
@@ -4,10 +4,14 @@
import os
-os.environ.setdefault(
- "DATABASE_URL",
- "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test",
+_DEFAULT_TEST_DB = (
+ "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
)
+TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
+
+# Force the app to use the test database regardless of any pre-existing
+# DATABASE_URL in the environment (e.g. from .env or shell profile).
+os.environ["DATABASE_URL"] = TEST_DATABASE_URL
import pytest
diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py
index 9dff257df..8b92a5aa8 100644
--- a/surfsense_backend/tests/integration/conftest.py
+++ b/surfsense_backend/tests/integration/conftest.py
@@ -1,4 +1,3 @@
-import os
import uuid
from unittest.mock import AsyncMock, MagicMock
@@ -18,14 +17,10 @@
User,
)
from app.indexing_pipeline.connector_document import ConnectorDocument
+from tests.conftest import TEST_DATABASE_URL
_EMBEDDING_DIM = app_config.embedding_model_instance.dimension
-_DEFAULT_TEST_DB = (
- "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
-)
-TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
-
@pytest_asyncio.fixture(scope="session")
async def async_engine():
From 7c09958ddc90adc72e1377e1a58be1d2ae647dd6 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 01:24:20 +0530
Subject: [PATCH 06/12] refactor: enhance document upload integration tests for
API contract validation
---
.../document_upload/test_document_upload.py | 179 +-----------------
1 file changed, 9 insertions(+), 170 deletions(-)
diff --git a/surfsense_backend/tests/integration/document_upload/test_document_upload.py b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
index 1a9433a8f..49ba9ce0a 100644
--- a/surfsense_backend/tests/integration/document_upload/test_document_upload.py
+++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
@@ -1,8 +1,10 @@
"""
-Integration tests for manual document upload.
+Integration tests for manual document upload - HTTP API layer.
-These tests exercise the full pipeline via the HTTP API:
- API upload → inline task dispatch → ETL extraction → chunking → embedding → DB storage
+Each test verifies a distinct user-facing behavior through the public HTTP
+endpoints. Pipeline internals (indexing, chunking, embedding) are covered by
+the ``indexing_pipeline`` test suite; this module focuses on the API contract,
+error handling, auth, and cross-cutting concerns like duplicate detection.
External boundaries mocked: LLM summarization, text embedding, text chunking,
Redis heartbeat. Task dispatch is swapped via DI (InlineTaskDispatcher).
@@ -21,8 +23,6 @@
from tests.utils.helpers import (
FIXTURES_DIR,
- delete_document,
- get_document,
poll_document_status,
upload_file,
upload_multiple_files,
@@ -30,27 +30,14 @@
pytestmark = pytest.mark.integration
-# ---------------------------------------------------------------------------
-# Helpers local to this module
-# ---------------------------------------------------------------------------
-
-
-def _assert_document_ready(doc: dict, *, expected_filename: str) -> None:
- """Common assertions for a successfully processed document."""
- assert doc["title"] == expected_filename
- assert doc["document_type"] == "FILE"
- assert doc["content"], "Document content (summary) should not be empty"
- assert doc["content_hash"], "content_hash should be set"
- assert doc["document_metadata"].get("FILE_NAME") == expected_filename
-
# ---------------------------------------------------------------------------
-# Test A: Upload a .txt file (direct read path)
+# Upload smoke tests (one per distinct code-path: direct-read & ETL)
# ---------------------------------------------------------------------------
class TestTxtFileUpload:
- """Upload a plain-text file and verify the full pipeline."""
+ """Upload a plain-text file (direct-read path) via the HTTP API."""
async def test_upload_txt_returns_document_id(
self,
@@ -89,83 +76,9 @@ async def test_txt_processing_reaches_ready(
for did in doc_ids:
assert statuses[did]["status"]["state"] == "ready"
- async def test_txt_document_fields_populated(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.txt", search_space_id=search_space_id
- )
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
-
- doc = await get_document(client, headers, doc_ids[0])
- _assert_document_ready(doc, expected_filename="sample.txt")
-
-
-# ---------------------------------------------------------------------------
-# Test B: Upload a .md file (markdown direct-read path)
-# ---------------------------------------------------------------------------
-
-
-class TestMarkdownFileUpload:
- """Upload a Markdown file and verify the full pipeline."""
-
- async def test_md_processing_reaches_ready(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.md", search_space_id=search_space_id
- )
- assert resp.status_code == 200
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- statuses = await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
- for did in doc_ids:
- assert statuses[did]["status"]["state"] == "ready"
-
- async def test_md_document_fields_populated(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.md", search_space_id=search_space_id
- )
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
-
- doc = await get_document(client, headers, doc_ids[0])
- _assert_document_ready(doc, expected_filename="sample.md")
-
-
-# ---------------------------------------------------------------------------
-# Test C: Upload a .pdf file (ETL path)
-# ---------------------------------------------------------------------------
-
class TestPdfFileUpload:
- """Upload a PDF and verify it goes through the ETL extraction pipeline."""
+ """Upload a PDF (ETL extraction path) via the HTTP API."""
async def test_pdf_processing_reaches_ready(
self,
@@ -187,26 +100,6 @@ async def test_pdf_processing_reaches_ready(
for did in doc_ids:
assert statuses[did]["status"]["state"] == "ready"
- async def test_pdf_document_fields_populated(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.pdf", search_space_id=search_space_id
- )
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id, timeout=300.0
- )
-
- doc = await get_document(client, headers, doc_ids[0])
- _assert_document_ready(doc, expected_filename="sample.pdf")
-
# ---------------------------------------------------------------------------
# Test D: Upload multiple files in a single request
@@ -214,7 +107,7 @@ async def test_pdf_document_fields_populated(
class TestMultiFileUpload:
- """Upload several files at once and verify all are processed."""
+ """Upload several files at once and verify the API response contract."""
async def test_multi_upload_returns_all_ids(
self,
@@ -236,28 +129,6 @@ async def test_multi_upload_returns_all_ids(
assert len(body["document_ids"]) == 2
cleanup_doc_ids.extend(body["document_ids"])
- async def test_multi_upload_all_reach_ready(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_multiple_files(
- client,
- headers,
- ["sample.txt", "sample.md"],
- search_space_id=search_space_id,
- )
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- statuses = await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
- for did in doc_ids:
- assert statuses[did]["status"]["state"] == "ready"
-
# ---------------------------------------------------------------------------
# Test E: Duplicate file upload (same file uploaded twice)
@@ -433,38 +304,6 @@ async def test_no_files_returns_error(
assert resp.status_code in {400, 422}
-# ---------------------------------------------------------------------------
-# Test J: Document deletion after successful upload
-# ---------------------------------------------------------------------------
-
-
-class TestDocumentDeletion:
- """Upload, wait for ready, delete, then verify it's gone."""
-
- async def test_delete_processed_document(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- ):
- resp = await upload_file(
- client, headers, "sample.txt", search_space_id=search_space_id
- )
- doc_ids = resp.json()["document_ids"]
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
-
- del_resp = await delete_document(client, headers, doc_ids[0])
- assert del_resp.status_code == 200
-
- get_resp = await client.get(
- f"/api/v1/documents/{doc_ids[0]}",
- headers=headers,
- )
- assert get_resp.status_code == 404
-
-
# ---------------------------------------------------------------------------
# Test K: Searchability after upload
# ---------------------------------------------------------------------------
From fd032f37096b9ba8c1e18145d8ceaf5216a34dff Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 01:48:25 +0530
Subject: [PATCH 07/12] refactor: simplify and clarify documentation in
document upload integration tests
---
.../document_upload/test_document_upload.py | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/surfsense_backend/tests/integration/document_upload/test_document_upload.py b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
index 49ba9ce0a..87f953a51 100644
--- a/surfsense_backend/tests/integration/document_upload/test_document_upload.py
+++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
@@ -1,16 +1,10 @@
"""
-Integration tests for manual document upload - HTTP API layer.
+Integration tests for the document upload HTTP API.
-Each test verifies a distinct user-facing behavior through the public HTTP
-endpoints. Pipeline internals (indexing, chunking, embedding) are covered by
-the ``indexing_pipeline`` test suite; this module focuses on the API contract,
-error handling, auth, and cross-cutting concerns like duplicate detection.
+Covers the API contract, auth, duplicate detection, and error handling.
+Pipeline internals are tested in the ``indexing_pipeline`` suite.
-External boundaries mocked: LLM summarization, text embedding, text chunking,
-Redis heartbeat. Task dispatch is swapped via DI (InlineTaskDispatcher).
-
-Prerequisites:
- - PostgreSQL + pgvector
+Requires PostgreSQL + pgvector.
"""
from __future__ import annotations
From 836d5293df4dab3260e3e4cfc1daae3918c41c77 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 01:52:35 +0530
Subject: [PATCH 08/12] refactor: remove unused TestStatusPolling class from
document upload integration tests
---
.../document_upload/test_document_upload.py | 50 -------------------
1 file changed, 50 deletions(-)
diff --git a/surfsense_backend/tests/integration/document_upload/test_document_upload.py b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
index 87f953a51..13ceae828 100644
--- a/surfsense_backend/tests/integration/document_upload/test_document_upload.py
+++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
@@ -335,53 +335,3 @@ async def test_uploaded_document_appears_in_search(
assert doc_ids[0] in result_ids, (
f"Uploaded document {doc_ids[0]} not found in search results: {result_ids}"
)
-
-
-# ---------------------------------------------------------------------------
-# Test L: Status polling returns correct structure
-# ---------------------------------------------------------------------------
-
-
-class TestStatusPolling:
- """Verify the status endpoint returns well-formed responses."""
-
- async def test_status_endpoint_returns_items(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- resp = await upload_file(
- client, headers, "sample.txt", search_space_id=search_space_id
- )
- doc_ids = resp.json()["document_ids"]
- cleanup_doc_ids.extend(doc_ids)
-
- status_resp = await client.get(
- "/api/v1/documents/status",
- headers=headers,
- params={
- "search_space_id": search_space_id,
- "document_ids": ",".join(str(d) for d in doc_ids),
- },
- )
- assert status_resp.status_code == 200
-
- body = status_resp.json()
- assert "items" in body
- assert len(body["items"]) == len(doc_ids)
- for item in body["items"]:
- assert "id" in item
- assert "status" in item
- assert "state" in item["status"]
- assert item["status"]["state"] in {
- "pending",
- "processing",
- "ready",
- "failed",
- }
-
- await poll_document_status(
- client, headers, doc_ids, search_space_id=search_space_id
- )
From 78dcce3e063825be8bdac7d24bc63ff4e5eca898 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 02:07:14 +0530
Subject: [PATCH 09/12] refactor: update testing documentation for clarity and
structure
---
surfsense_web/content/docs/testing.mdx | 74 +++++++++++++-------------
1 file changed, 37 insertions(+), 37 deletions(-)
diff --git a/surfsense_web/content/docs/testing.mdx b/surfsense_web/content/docs/testing.mdx
index e140b0d1f..dedceeb77 100644
--- a/surfsense_web/content/docs/testing.mdx
+++ b/surfsense_web/content/docs/testing.mdx
@@ -1,22 +1,17 @@
---
title: Testing
-description: Running and writing end-to-end tests for SurfSense
+description: Running and writing tests for SurfSense
---
-SurfSense uses [pytest](https://docs.pytest.org/) for end-to-end testing. Tests are **self-bootstrapping** — they automatically register a test user and discover search spaces, so no manual database setup is required.
+SurfSense uses [pytest](https://docs.pytest.org/) with two test layers: **unit** tests (no database) and **integration** tests (require PostgreSQL + pgvector). Tests are self-bootstrapping — they configure the test database, register a user, and clean up automatically.
## Prerequisites
-Before running tests, make sure the full backend stack is running:
+- **PostgreSQL + pgvector** running locally (database `surfsense_test` will be used)
+- **`REGISTRATION_ENABLED=TRUE`** in your `.env` (this is the default)
+- A working LLM model with a valid API key in `global_llm_config.yaml` (for integration tests)
-- **FastAPI backend**
-- **PostgreSQL + pgvector**
-- **Redis**
-- **Celery worker**
-
-Your backend must have **`REGISTRATION_ENABLED=TRUE`** in its `.env` (this is the default). The tests register their own user on first run.
-
-Your `global_llm_config.yaml` must have at least one working LLM model with a valid API key — document processing uses Auto mode, which routes through the global config.
+No Redis or Celery is required — integration tests use an inline task dispatcher.
## Running Tests
@@ -26,19 +21,19 @@ Your `global_llm_config.yaml` must have at least one working LLM model with a va
uv run pytest
```
-**Run by marker** (e.g., only document tests):
+**Run by marker:**
```bash
-uv run pytest -m document
+uv run pytest -m unit # fast, no DB needed
+uv run pytest -m integration # requires PostgreSQL + pgvector
```
**Available markers:**
| Marker | Description |
|---|---|
-| `document` | Document upload, processing, and deletion tests |
-| `connector` | Connector indexing tests |
-| `chat` | Chat and agent tests |
+| `unit` | Pure logic tests, no DB or external services |
+| `integration` | Tests that require a real PostgreSQL database |
**Useful flags:**
@@ -51,11 +46,11 @@ uv run pytest -m document
## Configuration
-Default pytest options are configured in `surfsense_backend/pyproject.toml`:
+Default pytest options are in `surfsense_backend/pyproject.toml`:
```toml
[tool.pytest.ini_options]
-addopts = "-v --tb=short -x --strict-markers -ra --durations=10"
+addopts = "-v --tb=short -x --strict-markers -ra --durations=5"
```
- `-v` — verbose test names
@@ -63,42 +58,47 @@ addopts = "-v --tb=short -x --strict-markers -ra --durations=10"
- `-x` — stop on first failure
- `--strict-markers` — reject unregistered markers
- `-ra` — show summary of all non-passing tests
-- `--durations=10` — show the 10 slowest tests
+- `--durations=5` — show the 5 slowest tests
## Environment Variables
-All test configuration has sensible defaults. Override via environment variables if needed:
-
| Variable | Default | Description |
|---|---|---|
-| `TEST_BACKEND_URL` | `http://localhost:8000` | Backend URL to test against |
-| `TEST_DATABASE_URL` | Falls back to `DATABASE_URL` | Direct DB connection for test cleanup |
-| `TEST_USER_EMAIL` | `testuser@surfsense.com` | Test user email |
-| `TEST_USER_PASSWORD` | `testpassword123` | Test user password |
+| `TEST_DATABASE_URL` | `postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test` | Database URL for tests |
-These can be configured in `surfsense_backend/.env` (see the Testing section at the bottom of `.env.example`).
+The test suite forces `DATABASE_URL` to point at the test database, so your production database is never touched.
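+
+For illustration, the override is roughly equivalent to the following (the real conftest wiring may differ):
+
+```python
+import os
+
+# Pin DATABASE_URL to the test database before app settings load,
+# falling back to the documented default if TEST_DATABASE_URL is unset.
+os.environ["DATABASE_URL"] = os.environ.get(
+    "TEST_DATABASE_URL",
+    "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test",
+)
+```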
-## How It Works
+## Test Layers
+
+### Unit Tests
+
+Pure logic tests that run without a database. They cover model validation, chunking, hashing, and summarization.
-Tests are fully self-bootstrapping:
+### Integration Tests
-1. **User creation** — on first run, tests try to log in. If the user doesn't exist, they register via `POST /auth/register`, then log in.
-2. **Search space discovery** — after authentication, tests call `GET /api/v1/searchspaces` and use the first available search space (auto-created during registration).
-3. **Session purge** — before any tests run, a session-scoped fixture deletes all documents in the test search space directly via the database. This handles stuck documents from previous crashed runs that the API refuses to delete (409 Conflict).
-4. **Per-test cleanup** — every test that creates documents adds their IDs to a `cleanup_doc_ids` list. An autouse fixture deletes them after each test via the API, falling back to direct DB access for any stuck documents.
+These require PostgreSQL + pgvector and are split into two suites:
+
+- **`document_upload/`** — Tests the HTTP API through public endpoints: upload, multi-file, duplicate detection, auth, error handling, page limits, and file size limits. Uses an in-process FastAPI client with `ASGITransport`.
+- **`indexing_pipeline/`** — Tests pipeline internals directly: `prepare_for_indexing`, `index()`, and `index_uploaded_file()` covering chunking, embedding, summarization, fallbacks, and error handling.
+
+External boundaries (LLM, embedding, chunking, Redis) are mocked in both suites.
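+
+A typical boundary mock looks roughly like this; the patch target below is a placeholder, not a real module path in the codebase:
+
+```python
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+
+@pytest.fixture
+def mock_summary_llm():
+    # Replace the LLM call with a canned coroutine so the test
+    # never makes a network request.
+    with patch(
+        "app.services.llm_service.summarize",  # placeholder target
+        new=AsyncMock(return_value="stub summary"),
+    ) as mocked:
+        yield mocked
+```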
+
+## How It Works
-This means tests work on both fresh databases and existing ones without any manual setup.
+1. **Database setup** — `TEST_DATABASE_URL` defaults to `surfsense_test`. Tables and extensions (`vector`, `pg_trgm`) are created once per session and dropped after.
+2. **Transaction isolation** — Each test runs inside a savepoint that is rolled back afterwards, so tests don't affect each other (see the sketch after this list).
+3. **User creation** — Integration tests register a test user via `POST /auth/register` on first run, then log in for subsequent requests.
+4. **Search space discovery** — Tests call `GET /api/v1/searchspaces` and use the first available space.
+5. **Cleanup** — A session fixture purges stale documents before tests run. Per-test cleanup deletes documents via API, falling back to direct DB access for stuck records.
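+
+A sketch of the savepoint pattern using SQLAlchemy's async API (the fixture name and engine URL are assumptions, not the exact conftest code):
+
+```python
+import pytest_asyncio
+from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
+
+
+@pytest_asyncio.fixture
+async def db_session():
+    engine = create_async_engine(
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
+    )
+    async with engine.connect() as conn:
+        outer = await conn.begin()
+        # "create_savepoint" makes session.commit() release a SAVEPOINT
+        # instead of committing the outer transaction.
+        session = AsyncSession(bind=conn, join_transaction_mode="create_savepoint")
+        try:
+            yield session
+        finally:
+            await session.close()
+            await outer.rollback()  # discard everything the test wrote
+    await engine.dispose()
+```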
## Writing New Tests
-1. Create a test file in the appropriate directory (e.g., `tests/e2e/test_connectors.py`).
-2. Add a module-level marker at the top:
+1. Create a test file in the appropriate directory (`unit/` or `integration/`).
+2. Add the marker at the top of the file:
```python
import pytest
-pytestmark = pytest.mark.connector
+pytestmark = pytest.mark.integration # or pytest.mark.unit
```
-3. Use fixtures from `conftest.py` — `client`, `headers`, `search_space_id`, and `cleanup_doc_ids` are available to all tests.
+3. Use fixtures from `conftest.py` — `client`, `headers`, `search_space_id`, and `cleanup_doc_ids` are available to integration tests (see the example after this list). Unit tests get `make_connector_document` and sample ID fixtures.
4. Register any new markers in `pyproject.toml` under `markers`.
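+
+Putting it together, a new integration test can be as small as this (the test name and assertion are illustrative):
+
+```python
+import pytest
+
+pytestmark = pytest.mark.integration
+
+
+async def test_search_spaces_available(client, headers):
+    # `client` is the in-process ASGI client; `headers` carry the
+    # bootstrap user's auth token.
+    resp = await client.get("/api/v1/searchspaces", headers=headers)
+    assert resp.status_code == 200
+```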
From 20ebb0703f1c0956847653f00f393e4f6b7a9e91 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 02:15:21 +0530
Subject: [PATCH 10/12] chore: add icons to documentation metadata for improved
visual representation
---
surfsense_web/content/docs/connectors/meta.json | 1 +
surfsense_web/content/docs/docker-installation.mdx | 1 +
surfsense_web/content/docs/how-to/meta.json | 1 +
surfsense_web/content/docs/index.mdx | 1 +
surfsense_web/content/docs/installation.mdx | 1 +
surfsense_web/content/docs/manual-installation.mdx | 1 +
surfsense_web/content/docs/testing.mdx | 1 +
surfsense_web/lib/source.ts | 6 ++++++
8 files changed, 13 insertions(+)
diff --git a/surfsense_web/content/docs/connectors/meta.json b/surfsense_web/content/docs/connectors/meta.json
index c6f7ab383..0c6dfe449 100644
--- a/surfsense_web/content/docs/connectors/meta.json
+++ b/surfsense_web/content/docs/connectors/meta.json
@@ -1,5 +1,6 @@
{
"title": "Connectors",
+ "icon": "Cable",
"pages": [
"google-drive",
"gmail",
diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx
index 767240206..3a42746dc 100644
--- a/surfsense_web/content/docs/docker-installation.mdx
+++ b/surfsense_web/content/docs/docker-installation.mdx
@@ -1,6 +1,7 @@
---
title: Docker Installation
description: Setting up SurfSense using Docker
+icon: Container
---
diff --git a/surfsense_web/content/docs/how-to/meta.json b/surfsense_web/content/docs/how-to/meta.json
index 9051b0585..83a5ea9f4 100644
--- a/surfsense_web/content/docs/how-to/meta.json
+++ b/surfsense_web/content/docs/how-to/meta.json
@@ -1,5 +1,6 @@
{
"title": "How to",
+ "icon": "BookOpen",
"pages": ["electric-sql", "realtime-collaboration"],
"defaultOpen": false
}
diff --git a/surfsense_web/content/docs/index.mdx b/surfsense_web/content/docs/index.mdx
index bb07c5f68..6c0450297 100644
--- a/surfsense_web/content/docs/index.mdx
+++ b/surfsense_web/content/docs/index.mdx
@@ -1,6 +1,7 @@
---
title: Prerequisites
description: Required setup before setting up SurfSense
+icon: ClipboardCheck
---
diff --git a/surfsense_web/content/docs/installation.mdx b/surfsense_web/content/docs/installation.mdx
index f5e948b64..6aa2eeb90 100644
--- a/surfsense_web/content/docs/installation.mdx
+++ b/surfsense_web/content/docs/installation.mdx
@@ -1,6 +1,7 @@
---
title: Installation
description: Current ways to use SurfSense
+icon: Download
---
# Installing SurfSense
diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx
index 1d30a12ef..7a974150c 100644
--- a/surfsense_web/content/docs/manual-installation.mdx
+++ b/surfsense_web/content/docs/manual-installation.mdx
@@ -1,6 +1,7 @@
---
title: Manual Installation
description: Setting up SurfSense manually for customized deployments (Preferred)
+icon: Wrench
---
# Manual Installation (Preferred)
diff --git a/surfsense_web/content/docs/testing.mdx b/surfsense_web/content/docs/testing.mdx
index dedceeb77..9c4682b90 100644
--- a/surfsense_web/content/docs/testing.mdx
+++ b/surfsense_web/content/docs/testing.mdx
@@ -1,6 +1,7 @@
---
title: Testing
description: Running and writing tests for SurfSense
+icon: FlaskConical
---
SurfSense uses [pytest](https://docs.pytest.org/) with two test layers: **unit** tests (no database) and **integration** tests (require PostgreSQL + pgvector). Tests are self-bootstrapping — they configure the test database, register a user, and clean up automatically.
diff --git a/surfsense_web/lib/source.ts b/surfsense_web/lib/source.ts
index 9d879b517..32a52c761 100644
--- a/surfsense_web/lib/source.ts
+++ b/surfsense_web/lib/source.ts
@@ -1,7 +1,13 @@
import { loader } from "fumadocs-core/source";
import { docs } from "@/.source/server";
+import { icons } from "lucide-react";
+import { createElement } from "react";
export const source = loader({
baseUrl: "/docs",
source: docs.toFumadocsSource(),
+ icon(icon) {
+ if (icon && icon in icons)
+ return createElement(icons[icon as keyof typeof icons]);
+ },
});
From 394a37576c47f579bde8118cba3b830bbda4328f Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Fri, 27 Feb 2026 02:36:18 +0530
Subject: [PATCH 11/12] refactor: update layout configuration and enhance
documentation layout
---
surfsense_web/app/docs/layout.tsx | 10 +++++++++-
surfsense_web/app/globals.css | 1 +
surfsense_web/app/layout.config.tsx | 4 ++--
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/surfsense_web/app/docs/layout.tsx b/surfsense_web/app/docs/layout.tsx
index 27dd5de7a..452106396 100644
--- a/surfsense_web/app/docs/layout.tsx
+++ b/surfsense_web/app/docs/layout.tsx
@@ -3,9 +3,17 @@ import type { ReactNode } from "react";
import { baseOptions } from "@/app/layout.config";
import { source } from "@/lib/source";
+const gridTemplate = `"sidebar header toc"
+"sidebar toc-popover toc"
+"sidebar main toc" 1fr / var(--fd-sidebar-col) minmax(0, 1fr) min-content`;
+
export default function Layout({ children }: { children: ReactNode }) {
return (
-
+			{item.icon}
+			{item.name}
+
+	);
+}