From c850d74b0b95a7e60b54f28047b4a9f302338aac Mon Sep 17 00:00:00 2001 From: "A. Wilcox" Date: Thu, 18 Dec 2025 16:11:48 -0600 Subject: [PATCH] Add end-to-end testing of TIND and LanceDB Implements: AP-517 --- README.rst | 30 ++++++++++++-- tests/e2e/__init__.py | 0 tests/e2e/test_everything.py | 79 ++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 tests/e2e/__init__.py create mode 100644 tests/e2e/test_everything.py diff --git a/README.rst b/README.rst index 93c9793..5233f7f 100644 --- a/README.rst +++ b/README.rst @@ -123,6 +123,28 @@ You can also pass the ``--profile`` argument:: Configuration ============= +Test Configuration +------------------ + +The following keys are searched for by the test suite. +They are ignored by the main app. + +``RUN_OLLAMA_TESTS`` + Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running. + +``RUN_E2E_TESTS`` + Set to anything (like ``true``) to run the end-to-end tests. + This requires Ollama and a functional TIND key. + +``KEEP_E2E_FILES`` + Set to anything (like ``true``) when running E2E tests to additionally keep the files + downloaded and created for your own later debugging. This includes the PDF and metadata + records from TIND, and the LanceDB test store. You may remove them when you are done. + + +App Configuration +----------------- + The following keys are available for configuration in the ``.env`` file: ``TIND_API_KEY`` @@ -134,9 +156,6 @@ The following keys are available for configuration in the ``.env`` file: ``DEFAULT_STORAGE_DIR`` The default directory to store files retrieved from TIND. -``RUN_OLLAMA_TESTS`` - Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running. - ``OLLAMA_URL`` Set to the instance of Ollama to use for the Web interface. Defaults to ``http://localhost:11434``; you may want ``http://ollama:11434`` for Docker runs. 
@@ -153,6 +172,9 @@ The following keys are available for configuration in the ``.env`` file: ``POSTGRES_DB`` The name of the database for the app. Defaults to ``willa``. +``POSTGRES_HOST`` + The hostname of the Postgres server. Likely ``db`` in a Docker Compose environment. + ``POSTGRES_PORT`` The Postgres port. Defaults to ``5432``. @@ -236,7 +258,7 @@ The following keys are available for configuration in the ``.env`` file: Defaults to '500' if not set. ``K_VALUE`` - Int. The k value used for retrieving context from the vector_store. The default is 4 + Int. The k value used for retrieving context from the vector_store. The default is 4. ``NULL_AUTH`` Boolean. Whether to allow anyone to login with any name and password. Defaults to ``False``. diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/test_everything.py b/tests/e2e/test_everything.py new file mode 100644 index 0000000..64d6ef3 --- /dev/null +++ b/tests/e2e/test_everything.py @@ -0,0 +1,79 @@ +""" +Test suite for the entire ETL pipeline, including external actors. + +This differs from ``tests.etl.test_pipeline`` in the following ways: + +* We do not mock TIND; we connect to the real thing. +* We use a LanceDB vector store instead of an in-memory vector store. +* We embed documents using the actual configured embeddings provider. + This defaults to ollama to prevent cost issues with Bedrock. +* We process multiple PDF transcripts into the vector store. +* We ensure retrieval of all documents, to prevent AP-503 recurring. +""" + +import os.path +import shutil +import tempfile +import unittest + +from willa.config import CONFIG +import willa.etl.pipeline + + +class E2ETest(unittest.TestCase): + """Test the entire pipeline. + + 1. Extract - Fetch three records from TIND. + 2. Transform/Load - Process them into a LanceDB vector store. + 3. Perform a number of queries to ensure the process was successful. 
+
+    The queries we run ensure results from every document are included.
+    """
+    def setUp(self) -> None:
+        """Point CONFIG at a fresh scratch directory; cleanups restore CONFIG after the test."""
+        self.temp_dir = tempfile.mkdtemp(prefix='willatest')
+        for key, subdir in (('DEFAULT_STORAGE_DIR', 'pdfs'), ('LANCEDB_URI', 'lancedb')):
+            if key in CONFIG:  # put the previous value back once the test is over
+                self.addCleanup(CONFIG.__setitem__, key, CONFIG[key])
+            else:  # key was unset: drop ours again (assumes CONFIG is dict-like)
+                self.addCleanup(CONFIG.pop, key, None)
+            path = os.path.join(self.temp_dir, subdir)
+            os.mkdir(path)
+            CONFIG[key] = path
+
+    @unittest.skipUnless(os.getenv("RUN_E2E_TESTS"), "requires network, keys, ollama")
+    def test_e2e_pipeline(self) -> None:
+        """Fetch three TIND records, run the pipeline, and query for each interviewee."""
+        self.assertIn('TIND_API_KEY', CONFIG, 'You must configure TIND API access')
+        self.assertIn('TIND_API_URL', CONFIG, 'You must configure TIND API access')
+        self.assertIn('OLLAMA_URL', CONFIG, 'You must have ollama running')
+        self.assertEqual(CONFIG['EMBED_BACKEND'], 'ollama',
+                         'You must use ollama embeddings for the E2E test')
+
+        willa.etl.pipeline.fetch_one_from_tind('219376')  # Sierra Club
+        willa.etl.pipeline.fetch_one_from_tind('218207')  # Genentech
+        willa.etl.pipeline.fetch_one_from_tind('103806')  # One from outside our present collections
+
+        store = willa.etl.pipeline.run_pipeline()
+
+        # The interviewee's name should only appear in their document.
+        expected = {'Perrault': '219376', 'Itakura': '218207', 'Parnell': '103806'}
+        # Reuse one retriever (and one k) for every query to save time and memory.
+        k_value = int(CONFIG['K_VALUE'])
+        retriever = store.as_retriever(search_kwargs={"k": k_value})
+        for name, tind_id in expected.items():
+            results = retriever.invoke(name)
+            self.assertEqual(len(results), k_value)  # compare with the configured k, not a literal 4
+            metadata = results[0].metadata
+            self.assertIn('tind_metadata', metadata, "TIND metadata missing!")
+            tind_md = metadata['tind_metadata']
+            self.assertIn('tind_id', tind_md, "TIND ID missing!")
+            self.assertListEqual(tind_md['tind_id'], [tind_id],
+                                 f"TIND ID {tind_md['tind_id']} doesn't match {tind_id}")
+
+    def tearDown(self) -> None:
+        """Remove files, unless `KEEP_E2E_FILES` is present in the environment."""
+        if os.getenv('KEEP_E2E_FILES'):
+            print(f"Files in {self.temp_dir} remain for your inspection.")
+        else:
+            shutil.rmtree(self.temp_dir)