Skip to content

Commit ec35f82

Browse files
committed
Revert "feat: enable UniProt search in protein QA pipeline"
This reverts commit 28e5795.
1 parent ade5a01 commit ec35f82

File tree

3 files changed

+7
-44
lines changed

3 files changed

+7
-44
lines changed

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,3 @@ cache
177177
*.pyc
178178
*.html
179179
.gradio
180-
181-
# Local test scripts
182-
test_database_access.py

graphgen/configs/protein_qa_config.yaml

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,6 @@ pipeline:
33
params:
44
input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55

6-
- name: search
7-
params:
8-
data_sources: [uniprot] # data source for searcher
9-
uniprot_params:
10-
use_local_blast: false # false means use online API
11-
126
- name: chunk
137
params:
148
chunk_size: 1024 # chunk size for text splitting

graphgen/operators/search/search_all.py

Lines changed: 7 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ async def search_all(
2020
Perform searches across multiple search types and aggregate the results.
2121
:param seed_data: A dictionary containing seed data with entity names.
2222
:param search_config: A dictionary specifying which data sources to use for searching.
23-
:return: A dictionary with doc_hash as keys and search results as values.
23+
:return: A dictionary with
2424
"""
2525

2626
results = {}
@@ -34,47 +34,19 @@ async def search_all(
3434
**search_config.get("uniprot_params", {})
3535
)
3636

37-
# Prepare search queries: map doc_hash to content
38-
doc_queries = {}
39-
for doc_hash, doc_data in seed_data.items():
40-
# Try to extract search query from different data types
41-
query = None
42-
if "content" in doc_data:
43-
query = doc_data["content"]
44-
elif doc_data.get("type") == "protein" and "protein_caption" in doc_data:
45-
# For protein type, try to use sequence, id, or protein_name
46-
protein_caption = doc_data["protein_caption"]
47-
if "sequence" in protein_caption and protein_caption["sequence"]:
48-
query = protein_caption["sequence"]
49-
elif "id" in protein_caption and protein_caption["id"]:
50-
query = protein_caption["id"]
51-
elif "protein_name" in protein_caption and protein_caption["protein_name"]:
52-
query = protein_caption["protein_name"]
53-
54-
if query:
55-
if query not in doc_queries:
56-
doc_queries[query] = []
57-
doc_queries[query].append(doc_hash)
58-
59-
# Get unique queries
60-
unique_queries = list(doc_queries.keys())
61-
62-
# Perform searches
37+
data = list(seed_data.values())
38+
data = [d["content"] for d in data if "content" in d]
39+
data = list(set(data)) # Remove duplicates
6340
uniprot_results = await run_concurrent(
6441
uniprot_search_client.search,
65-
unique_queries,
42+
data,
6643
desc="Searching UniProt database",
6744
unit="keyword",
6845
)
69-
70-
# Map results back to doc hashes
71-
for query, result in zip(unique_queries, uniprot_results):
72-
for doc_hash in doc_queries[query]:
73-
if doc_hash not in results:
74-
results[doc_hash] = {}
75-
results[doc_hash][data_source] = result
7646
else:
7747
logger.error("Data source %s not supported.", data_source)
7848
continue
7949

50+
results[data_source] = uniprot_results
51+
8052
return results

0 commit comments

Comments (0)