Skip to content

Commit ec35f82

Browse files
committed
Revert "feat: enable UniProt search in protein QA pipeline"
This reverts commit 28e5795.
1 parent ade5a01 commit ec35f82

File tree

3 files changed

+7
-44
lines changed

3 files changed

+7
-44
lines changed

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,3 @@ cache
177177
*.pyc
178178
*.html
179179
.gradio
180-
181-
# Local test scripts
182-
test_database_access.py

graphgen/configs/protein_qa_config.yaml

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,6 @@ pipeline:
33
params:
44
input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55

6-
- name: search
7-
params:
8-
data_sources: [uniprot] # data source for searcher
9-
uniprot_params:
10-
use_local_blast: false # false means use online API
11-
126
- name: chunk
137
params:
148
chunk_size: 1024 # chunk size for text splitting

graphgen/operators/search/search_all.py

Lines changed: 7 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ async def search_all(
2020
Perform searches across multiple search types and aggregate the results.
2121
:param seed_data: A dictionary containing seed data with entity names.
2222
:param search_config: A dictionary specifying which data sources to use for searching.
23-
:return: A dictionary with doc_hash as keys and search results as values.
23+
:return: A dictionary with
2424
"""
2525

2626
results = {}
@@ -34,47 +34,19 @@ async def search_all(
3434
**search_config.get("uniprot_params", {})
3535
)
3636

37-
# Prepare search queries: map doc_hash to content
38-
doc_queries = {}
39-
for doc_hash, doc_data in seed_data.items():
40-
# Try to extract search query from different data types
41-
query = None
42-
if "content" in doc_data:
43-
query = doc_data["content"]
44-
elif doc_data.get("type") == "protein" and "protein_caption" in doc_data:
45-
# For protein type, try to use sequence, id, or protein_name
46-
protein_caption = doc_data["protein_caption"]
47-
if "sequence" in protein_caption and protein_caption["sequence"]:
48-
query = protein_caption["sequence"]
49-
elif "id" in protein_caption and protein_caption["id"]:
50-
query = protein_caption["id"]
51-
elif "protein_name" in protein_caption and protein_caption["protein_name"]:
52-
query = protein_caption["protein_name"]
53-
54-
if query:
55-
if query not in doc_queries:
56-
doc_queries[query] = []
57-
doc_queries[query].append(doc_hash)
58-
59-
# Get unique queries
60-
unique_queries = list(doc_queries.keys())
61-
62-
# Perform searches
37+
data = list(seed_data.values())
38+
data = [d["content"] for d in data if "content" in d]
39+
data = list(set(data)) # Remove duplicates
6340
uniprot_results = await run_concurrent(
6441
uniprot_search_client.search,
65-
unique_queries,
42+
data,
6643
desc="Searching UniProt database",
6744
unit="keyword",
6845
)
69-
70-
# Map results back to doc hashes
71-
for query, result in zip(unique_queries, uniprot_results):
72-
for doc_hash in doc_queries[query]:
73-
if doc_hash not in results:
74-
results[doc_hash] = {}
75-
results[doc_hash][data_source] = result
7646
else:
7747
logger.error("Data source %s not supported.", data_source)
7848
continue
7949

50+
results[data_source] = uniprot_results
51+
8052
return results

0 commit comments

Comments (0)