Skip to content

Commit 28e5795

Browse files
committed
feat: enable UniProt search in protein QA pipeline
- Add search step to protein_qa_config.yaml with UniProt API support
- Fix search_all return format to match storage structure (doc_hash -> results mapping)
- Enhance search_all to support protein type data extraction from protein_caption
- Add test_database_access.py to .gitignore
1 parent bfc1d18 commit 28e5795

File tree

3 files changed

+44
-7
lines changed

3 files changed

+44
-7
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,6 @@ cache
177177
*.pyc
178178
*.html
179179
.gradio
180+
181+
# Local test scripts
182+
test_database_access.py

graphgen/configs/protein_qa_config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ pipeline:
33
params:
44
input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55

6+
- name: search
7+
params:
8+
data_sources: [uniprot] # data source for searcher
9+
uniprot_params:
10+
use_local_blast: false # false means use online API
11+
612
- name: chunk
713
params:
814
chunk_size: 1024 # chunk size for text splitting

graphgen/operators/search/search_all.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ async def search_all(
2020
Perform searches across multiple search types and aggregate the results.
2121
:param seed_data: A dictionary containing seed data with entity names.
2222
:param search_config: A dictionary specifying which data sources to use for searching.
23-
:return: A dictionary with
23+
:return: A dictionary with doc_hash as keys and search results as values.
2424
"""
2525

2626
results = {}
@@ -34,19 +34,47 @@ async def search_all(
3434
**search_config.get("uniprot_params", {})
3535
)
3636

37-
data = list(seed_data.values())
38-
data = [d["content"] for d in data if "content" in d]
39-
data = list(set(data)) # Remove duplicates
37+
# Prepare search queries: map doc_hash to content
38+
doc_queries = {}
39+
for doc_hash, doc_data in seed_data.items():
40+
# Try to extract search query from different data types
41+
query = None
42+
if "content" in doc_data:
43+
query = doc_data["content"]
44+
elif doc_data.get("type") == "protein" and "protein_caption" in doc_data:
45+
# For protein type, try to use sequence, id, or protein_name
46+
protein_caption = doc_data["protein_caption"]
47+
if "sequence" in protein_caption and protein_caption["sequence"]:
48+
query = protein_caption["sequence"]
49+
elif "id" in protein_caption and protein_caption["id"]:
50+
query = protein_caption["id"]
51+
elif "protein_name" in protein_caption and protein_caption["protein_name"]:
52+
query = protein_caption["protein_name"]
53+
54+
if query:
55+
if query not in doc_queries:
56+
doc_queries[query] = []
57+
doc_queries[query].append(doc_hash)
58+
59+
# Get unique queries
60+
unique_queries = list(doc_queries.keys())
61+
62+
# Perform searches
4063
uniprot_results = await run_concurrent(
4164
uniprot_search_client.search,
42-
data,
65+
unique_queries,
4366
desc="Searching UniProt database",
4467
unit="keyword",
4568
)
69+
70+
# Map results back to doc hashes
71+
for query, result in zip(unique_queries, uniprot_results):
72+
for doc_hash in doc_queries[query]:
73+
if doc_hash not in results:
74+
results[doc_hash] = {}
75+
results[doc_hash][data_source] = result
4676
else:
4777
logger.error("Data source %s not supported.", data_source)
4878
continue
4979

50-
results[data_source] = uniprot_results
51-
5280
return results

0 commit comments

Comments (0)