Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,28 @@ You can also pass the ``--profile`` argument::
Configuration
=============

Test Configuration
------------------

The following keys are searched for by the test suite.
They are ignored by the main app.

``RUN_OLLAMA_TESTS``
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.

``RUN_E2E_TESTS``
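Set to any non-empty value (like ``true``) to run the end-to-end tests.
This requires a running Ollama instance (``OLLAMA_URL``) with ``EMBED_BACKEND`` set to
``ollama``, and working TIND API access (``TIND_API_KEY`` and ``TIND_API_URL``).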

``KEEP_E2E_FILES``
Set to any non-empty value (like ``true``) when running E2E tests to additionally keep the files
downloaded and created for your own later debugging. This includes the PDF and metadata
records from TIND, and the LanceDB test store. You may remove them when you are done.


App Configuration
-----------------

The following keys are available for configuration in the ``.env`` file:

``TIND_API_KEY``
Expand All @@ -134,9 +156,6 @@ The following keys are available for configuration in the ``.env`` file:
``DEFAULT_STORAGE_DIR``
The default directory to store files retrieved from TIND.

``RUN_OLLAMA_TESTS``
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.

``OLLAMA_URL``
Set to the instance of Ollama to use for the Web interface.
Defaults to ``http://localhost:11434``; you may want ``http://ollama:11434`` for Docker runs.
Expand All @@ -153,6 +172,9 @@ The following keys are available for configuration in the ``.env`` file:
``POSTGRES_DB``
The name of the database for the app. Defaults to ``willa``.

``POSTGRES_HOST``
The hostname of the Postgres server. Likely ``db`` in a Docker Compose environment.

``POSTGRES_PORT``
The Postgres port. Defaults to ``5432``.

Expand Down Expand Up @@ -236,7 +258,7 @@ The following keys are available for configuration in the ``.env`` file:
Defaults to '500' if not set.

``K_VALUE``
Int. The k value used for retrieving context from the vector_store. The default is 4
Int. The k value used for retrieving context from the vector_store. The default is 4.

``NULL_AUTH``
Boolean. Whether to allow anyone to login with any name and password. Defaults to ``False``.
Expand Down
Empty file added tests/e2e/__init__.py
Empty file.
79 changes: 79 additions & 0 deletions tests/e2e/test_everything.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Test suite for the entire ETL pipeline, including external actors.

This differs from ``tests.etl.test_pipeline`` in the following ways:

* We do not mock TIND; we connect to the real thing.
* We use a LanceDB vector store instead of an in-memory vector store.
* We embed documents using the actual configured embeddings provider.
This defaults to ollama to prevent cost issues with Bedrock.
* We process multiple PDF transcripts into the vector store.
* We ensure that every document can be retrieved, to prevent a recurrence of AP-503.
"""

import os.path
import shutil
import tempfile
import unittest

from willa.config import CONFIG
import willa.etl.pipeline


class E2ETest(unittest.TestCase):
    """Test the entire pipeline.

    1. Extract - Fetch three records from TIND.
    2. Transform/Load - Process them into a LanceDB vector store.
    3. Perform a number of queries to ensure the process was successful.

    The queries we run ensure results from every document are included.
    """

    def setUp(self) -> None:
        """Initialise the environment for the end-to-end test.

        Creates a scratch directory (``self.temp_dir``) containing a ``pdfs``
        directory and a ``lancedb`` directory, and points the
        ``DEFAULT_STORAGE_DIR`` and ``LANCEDB_URI`` configuration keys at them.
        """
        self.temp_dir = tempfile.mkdtemp(prefix='willatest')

        storage_dir = os.path.join(self.temp_dir, 'pdfs')
        os.mkdir(storage_dir)
        self._override_config('DEFAULT_STORAGE_DIR', storage_dir)

        data_dir = os.path.join(self.temp_dir, 'lancedb')
        os.mkdir(data_dir)
        self._override_config('LANCEDB_URI', data_dir)

    def _override_config(self, key: str, value: str) -> None:
        """Set ``CONFIG[key]`` to ``value`` for the duration of this test.

        If the key already had a value, it is put back once the test has
        finished, so that later tests in the same process do not inherit a
        path into a scratch directory that may already have been deleted.
        """
        if key in CONFIG:
            # Cleanups run after tearDown, regardless of the test outcome.
            self.addCleanup(CONFIG.__setitem__, key, CONFIG[key])
        # NOTE(review): a key that was previously absent is left set after the
        # test; removing it would need a deletion API on CONFIG - confirm.
        CONFIG[key] = value

    @unittest.skipUnless(os.getenv("RUN_E2E_TESTS"), "requires network, keys, ollama")
    def test_e2e_pipeline(self) -> None:
        """Test the pipeline.

        Fetches three records from TIND, runs the pipeline, then queries the
        resulting store once per interviewee name and checks that the top
        result carries the TIND ID of that interviewee's record.
        """
        self.assertIn('TIND_API_KEY', CONFIG, 'You must configure TIND API access')
        self.assertIn('TIND_API_URL', CONFIG, 'You must configure TIND API access')
        self.assertIn('OLLAMA_URL', CONFIG, 'You must have ollama running')
        self.assertEqual(CONFIG['EMBED_BACKEND'], 'ollama',
                         'You must use ollama embeddings for the E2E test')

        willa.etl.pipeline.fetch_one_from_tind('219376')  # Sierra Club
        willa.etl.pipeline.fetch_one_from_tind('218207')  # Genentech
        willa.etl.pipeline.fetch_one_from_tind('103806')  # One from outside our present collections

        store = willa.etl.pipeline.run_pipeline()

        # The interviewee's name should only appear in their document.
        expected = {'Perrault': '219376', 'Itakura': '218207', 'Parnell': '103806'}
        # We can reuse the same retriever for each query to save time and memory.
        k_value = int(CONFIG['K_VALUE'])
        retriever = store.as_retriever(search_kwargs={"k": k_value})
        for name, tind_id in expected.items():
            # One sub-test per document, so a failure for one record does not
            # hide the outcome for the others.
            with self.subTest(name=name, tind_id=tind_id):
                results = retriever.invoke(name)
                # Compare against the k we actually asked for, so the test
                # still holds when K_VALUE is configured to something other
                # than its default.
                self.assertEqual(len(results), k_value)
                metadata = results[0].metadata
                self.assertIn('tind_metadata', metadata, "TIND metadata missing!")
                tind_md = metadata['tind_metadata']
                self.assertIn('tind_id', tind_md, "TIND ID missing!")
                # The message is built before the assertion runs, so it must
                # not index into the list: an empty list would raise
                # IndexError and mask the real mismatch.
                self.assertListEqual(tind_md['tind_id'], [tind_id],
                                     f"TIND ID {tind_md['tind_id']} doesn't match {tind_id}")

    def tearDown(self) -> None:
        """Remove files, unless `KEEP_E2E_FILES` is present in the environment."""
        if os.getenv('KEEP_E2E_FILES'):
            print(f"Files in {self.temp_dir} remain for your inspection.")
            return

        shutil.rmtree(self.temp_dir)