Skip to content

Commit bb42b7f

Browse files
workflow: add search_uniprot example
1 parent 9de02f5 commit bb42b7f

File tree

3 files changed

+17
-11
lines changed

3 files changed

+17
-11
lines changed

graphgen/configs/search_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
pipeline:
22
- name: read
33
params:
4-
input_file: resources/input_examples/search_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
4+
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55

66
- name: search
77
params:

graphgen/models/searcher/db/uniprot_searcher.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,16 @@ def _get_pool():
2727
return ThreadPoolExecutor(max_workers=10)
2828

2929

30+
# ensure only one BLAST search runs at a time
31+
_blast_lock = asyncio.Lock()
32+
33+
3034
class UniProtSearch(BaseSearcher):
3135
"""
3236
UniProt Search client for searching UniProt.
3337
1) Get the protein by accession number.
3438
2) Search with keywords or protein names (fuzzy searcher).
35-
3) Search with FASTA sequence (BLAST searcher).
39+
3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async.
3640
"""
3741

3842
def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"):
@@ -230,22 +234,21 @@ async def search(
230234
if query.startswith(">") or re.fullmatch(
231235
r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
232236
):
233-
coro = loop.run_in_executor(
234-
_get_pool(), self.get_by_fasta, query, threshold
235-
)
237+
async with _blast_lock:
238+
result = await loop.run_in_executor(
239+
_get_pool(), self.get_by_fasta, query, threshold
240+
)
236241

237242
# check if accession number
238243
elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
239-
coro = loop.run_in_executor(_get_pool(), self.get_by_accession, query)
244+
result = await loop.run_in_executor(
245+
_get_pool(), self.get_by_accession, query
246+
)
240247

241248
else:
242249
# otherwise treat as keyword
243-
coro = loop.run_in_executor(_get_pool(), self.get_best_hit, query)
250+
result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
244251

245-
result = await coro
246252
if result:
247253
result["_search_query"] = query
248254
return result
249-
250-
251-
# TODO: use local UniProt database for large-scale searches

scripts/search/search_uniprot.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
python3 -m graphgen.run \
2+
--config_file graphgen/configs/search_config.yaml \
3+
--output_dir cache/

0 commit comments

Comments
 (0)