Skip to content

Commit 221b864

Browse files
authored
INTPYTHON-764 Make usage of text_key more obvious (#220)
1 parent a3a5c9c commit 221b864

18 files changed

+226
-47
lines changed

libs/langchain-mongodb/langchain_mongodb/retrievers/full_text_search.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from typing import Annotated, Any, Dict, List, Optional, Union
23

34
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
@@ -46,7 +47,13 @@ def _get_relevant_documents(
4647
Returns:
4748
List of relevant documents
4849
"""
49-
default_k = self.k if self.k is not None else self.top_k
50+
is_top_k_set = False
51+
with warnings.catch_warnings():
52+
# Ignore warning raised by checking the value of top_k.
53+
warnings.simplefilter("ignore", DeprecationWarning)
54+
if self.top_k is not None:
55+
is_top_k_set = True
56+
default_k = self.k if not is_top_k_set else self.top_k
5057
pipeline = text_search_stage( # type: ignore
5158
query=query,
5259
search_field=self.search_field,

libs/langchain-mongodb/langchain_mongodb/retrievers/hybrid_search.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from typing import Annotated, Any, Dict, List, Optional
23

34
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
@@ -83,8 +84,14 @@ def _get_relevant_documents(
8384
pipeline: List[Any] = []
8485

8586
# Get the appropriate value for k.
86-
default_k = self.top_k if self.top_k is not None else self.k
87-
k = kwargs.get("k", default_k)
87+
is_top_k_set = False
88+
with warnings.catch_warnings():
89+
# Ignore warning raised by checking the value of top_k.
90+
warnings.simplefilter("ignore", DeprecationWarning)
91+
if self.top_k is not None:
92+
is_top_k_set = True
93+
default_k = self.k if not is_top_k_set else self.top_k
94+
k: int = kwargs.get("k", default_k) # type:ignore[assignment]
8895

8996
# First we build up the aggregation pipeline,
9097
# then it is passed to the server to execute

libs/langchain-mongodb/langchain_mongodb/vectorstores.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
import warnings
45
from typing import (
56
Any,
67
Callable,
@@ -108,6 +109,7 @@ class MongoDBAtlasVectorSearch(VectorStore):
108109
namespace="db_name.collection_name",
109110
embedding=OpenAIEmbeddings(),
110111
index_name="vector_index",
112+
text_key="text_field"
111113
)
112114
113115
Add Documents:
@@ -807,15 +809,27 @@ def _similarity_search_with_score(
807809
docs = []
808810

809811
# Format
812+
missing_text_key = False
810813
for res in cursor:
811814
if self._text_key not in res:
815+
missing_text_key = True
812816
continue
813817
text = res.pop(self._text_key)
814818
score = res.pop("score")
815819
make_serializable(res)
816820
docs.append(
817821
(Document(page_content=text, metadata=res, id=res["_id"]), score)
818822
)
823+
824+
if (
825+
missing_text_key
826+
and not len(docs)
827+
and self._collection.count_documents({}) > 0
828+
):
829+
warnings.warn(
830+
f"Could not find any documents with the text_key: '{self._text_key}'",
831+
stacklevel=1,
832+
)
819833
return docs
820834

821835
def create_vector_search_index(

libs/langchain-mongodb/pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,20 @@ dev = [
4545
]
4646

4747
[tool.pytest.ini_options]
48+
minversion = "7"
4849
addopts = "--snapshot-warn-unused --strict-markers --strict-config --durations=5"
50+
log_cli_level = "INFO"
51+
faulthandler_timeout = 1500
52+
xfail_strict = true
4953
markers = [
5054
"requires: mark tests as requiring a specific library",
5155
"compile: mark placeholder test used to compile integration tests without running them",
5256
]
5357
asyncio_mode = "auto"
5458
asyncio_default_fixture_loop_scope = "function"
59+
filterwarnings = [
60+
"error"
61+
]
5562

5663
[tool.mypy]
5764
disallow_untyped_defs = true

libs/langchain-mongodb/tests/integration_tests/conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import warnings
23
from typing import Generator, List
34

45
import pytest
@@ -16,7 +17,10 @@
1617
def technical_report_pages() -> List[Document]:
1718
"""Returns a Document for each of the 100 pages of a GPT-4 Technical Report"""
1819
loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf")
19-
pages = loader.load()
20+
with warnings.catch_warnings():
21+
# Ignore warnings raised by base class.
22+
warnings.simplefilter("ignore", ResourceWarning)
23+
pages = loader.load()
2024
return pages
2125

2226

libs/langchain-mongodb/tests/integration_tests/test_agent_toolkit.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,4 @@ def test_toolkit_response(db):
7474
for event in events:
7575
messages.extend(event["messages"])
7676
assert "USA" in messages[-1].content, messages[-1].content
77+
db_wrapper.close()

libs/langchain-mongodb/tests/integration_tests/test_cache.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def llm_cache(cls: Any) -> BaseCache:
7070
)
7171
)
7272
assert get_llm_cache()
73-
return get_llm_cache()
73+
return get_llm_cache() # type:ignore[return-value]
7474

7575

7676
@pytest.fixture(scope="module", autouse=True)
@@ -99,7 +99,7 @@ def _execute_test(
9999
dumped_prompt: str = prompt if isinstance(prompt, str) else dumps(prompt)
100100

101101
# Update the cache
102-
get_llm_cache().update(dumped_prompt, llm_string, response)
102+
get_llm_cache().update(dumped_prompt, llm_string, response) # type:ignore[union-attr]
103103

104104
# Retrieve the cached result through 'generate' call
105105
output: Union[List[Generation], LLMResult, None]
@@ -156,7 +156,8 @@ def test_mongodb_cache(
156156
try:
157157
_execute_test(prompt, llm, response)
158158
finally:
159-
get_llm_cache().clear()
159+
get_llm_cache().clear() # type:ignore[union-attr]
160+
get_llm_cache().close() # type:ignore[attr-defined,union-attr]
160161

161162

162163
@pytest.mark.parametrize(
@@ -207,4 +208,5 @@ def test_mongodb_atlas_cache_matrix(
207208
assert llm.generate(prompts) == LLMResult(
208209
generations=llm_generations, llm_output={}
209210
)
210-
get_llm_cache().clear()
211+
get_llm_cache().clear() # type:ignore[union-attr]
212+
get_llm_cache().close() # type:ignore[attr-defined,union-attr]

libs/langchain-mongodb/tests/integration_tests/test_chat_message_histories.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import warnings
23

34
from langchain.memory import ConversationBufferMemory # type: ignore[import-not-found]
45
from langchain_core.messages import message_to_dict
@@ -19,9 +20,12 @@ def test_memory_with_message_store() -> None:
1920
database_name=DB_NAME,
2021
collection_name=COLLECTION,
2122
)
22-
memory = ConversationBufferMemory(
23-
memory_key="baz", chat_memory=message_history, return_messages=True
24-
)
23+
with warnings.catch_warnings():
24+
# Ignore warnings raised by base class.
25+
warnings.simplefilter("ignore", DeprecationWarning)
26+
memory = ConversationBufferMemory(
27+
memory_key="baz", chat_memory=message_history, return_messages=True
28+
)
2529

2630
# add some messages
2731
memory.chat_memory.add_ai_message("This is me, the AI")
@@ -38,3 +42,4 @@ def test_memory_with_message_store() -> None:
3842
memory.chat_memory.clear()
3943

4044
assert memory.chat_memory.messages == []
45+
memory.chat_memory.close() # type:ignore[attr-defined]

libs/langchain-mongodb/tests/integration_tests/test_parent_document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,4 @@ def test_1clxn_retriever(
7474
assert len(responses) == 3
7575
assert all("GPT-4" in doc.page_content for doc in responses)
7676
assert {4, 5, 29} == set(doc.metadata["page"] for doc in responses)
77+
client.close()

libs/langchain-mongodb/tests/integration_tests/test_retrievers.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,14 @@ def test_hybrid_retriever_deprecated_top_k(
194194
)
195195

196196
query1 = "When did I visit France?"
197-
results = retriever.invoke(query1)
197+
with pytest.warns(DeprecationWarning):
198+
results = retriever.invoke(query1)
198199
assert len(results) == 3
199200
assert "Paris" in results[0].page_content
200201

201202
query2 = "When was the last time I visited new orleans?"
202-
results = retriever.invoke(query2)
203+
with pytest.warns(DeprecationWarning):
204+
results = retriever.invoke(query2)
203205
assert "New Orleans" in results[0].page_content
204206

205207

0 commit comments

Comments (0)