Skip to content

Commit 28e5795

Browse files
committed
feat: enable UniProt search in protein QA pipeline
- Add search step to protein_qa_config.yaml with UniProt API support
- Fix search_all return format to match storage structure (doc_hash -> results mapping)
- Enhance search_all to support protein type data extraction from protein_caption
- Add test_database_access.py to .gitignore
1 parent bfc1d18 commit 28e5795

File tree

3 files changed

+44
-7
lines changed

3 files changed

+44
-7
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,6 @@ cache
177177
*.pyc
178178
*.html
179179
.gradio
180+
181+
# Local test scripts
182+
test_database_access.py

graphgen/configs/protein_qa_config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ pipeline:
33
params:
44
input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55

6+
- name: search
7+
params:
8+
data_sources: [uniprot] # data source for searcher
9+
uniprot_params:
10+
use_local_blast: false # false means use online API
11+
612
- name: chunk
713
params:
814
chunk_size: 1024 # chunk size for text splitting

graphgen/operators/search/search_all.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ async def search_all(
2020
Perform searches across multiple search types and aggregate the results.
2121
:param seed_data: A dictionary containing seed data with entity names.
2222
:param search_config: A dictionary specifying which data sources to use for searching.
23-
:return: A dictionary with
23+
:return: A dictionary with doc_hash as keys and search results as values.
2424
"""
2525

2626
results = {}
@@ -34,19 +34,47 @@ async def search_all(
3434
**search_config.get("uniprot_params", {})
3535
)
3636

37-
data = list(seed_data.values())
38-
data = [d["content"] for d in data if "content" in d]
39-
data = list(set(data)) # Remove duplicates
37+
# Prepare search queries: map doc_hash to content
38+
doc_queries = {}
39+
for doc_hash, doc_data in seed_data.items():
40+
# Try to extract search query from different data types
41+
query = None
42+
if "content" in doc_data:
43+
query = doc_data["content"]
44+
elif doc_data.get("type") == "protein" and "protein_caption" in doc_data:
45+
# For protein type, try to use sequence, id, or protein_name
46+
protein_caption = doc_data["protein_caption"]
47+
if "sequence" in protein_caption and protein_caption["sequence"]:
48+
query = protein_caption["sequence"]
49+
elif "id" in protein_caption and protein_caption["id"]:
50+
query = protein_caption["id"]
51+
elif "protein_name" in protein_caption and protein_caption["protein_name"]:
52+
query = protein_caption["protein_name"]
53+
54+
if query:
55+
if query not in doc_queries:
56+
doc_queries[query] = []
57+
doc_queries[query].append(doc_hash)
58+
59+
# Get unique queries
60+
unique_queries = list(doc_queries.keys())
61+
62+
# Perform searches
4063
uniprot_results = await run_concurrent(
4164
uniprot_search_client.search,
42-
data,
65+
unique_queries,
4366
desc="Searching UniProt database",
4467
unit="keyword",
4568
)
69+
70+
# Map results back to doc hashes
71+
for query, result in zip(unique_queries, uniprot_results):
72+
for doc_hash in doc_queries[query]:
73+
if doc_hash not in results:
74+
results[doc_hash] = {}
75+
results[doc_hash][data_source] = result
4676
else:
4777
logger.error("Data source %s not supported.", data_source)
4878
continue
4979

50-
results[data_source] = uniprot_results
51-
5280
return results

0 commit comments

Comments (0)