Skip to content

Commit 737f45d

Browse files
fix: fix dna/rna local blast (#111)
* fix: fix dna/rna local blast * Update scripts/search/build_db/build_rna_blast_db.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * Update scripts/search/build_db/build_rna_blast_db.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * fix: fix pylint problems --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 761e64b commit 737f45d

File tree

9 files changed

+311
-131
lines changed

9 files changed

+311
-131
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,7 @@ cache
177177
*.pyc
178178
*.html
179179
.gradio
180+
181+
# macOS
182+
.DS_Store
183+
**/.DS_Store

graphgen/configs/search_dna_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,5 @@ pipeline:
1313
email: test@example.com # NCBI requires an email address
1414
tool: GraphGen # tool name for NCBI API
1515
use_local_blast: true # whether to use local blast for DNA search
16-
local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
16+
local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
1717

graphgen/configs/search_rna_config.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,4 @@ pipeline:
1111
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
1212
rnacentral_params:
1313
use_local_blast: true # whether to use local blast for RNA search
14-
local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
15-
# can also use DNA database with RNA sequences (if already built)
16-
14+
local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)

graphgen/models/searcher/db/ncbi_searcher.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None):
8383
data = data.get(key, default)
8484
return data
8585

86+
@staticmethod
87+
def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
88+
"""Infer molecule_type_detail from accession prefix or gene type."""
89+
if accession:
90+
if accession.startswith(("NM_", "XM_")):
91+
return "mRNA"
92+
if accession.startswith(("NC_", "NT_")):
93+
return "genomic DNA"
94+
if accession.startswith(("NR_", "XR_")):
95+
return "RNA"
96+
if accession.startswith("NG_"):
97+
return "genomic region"
98+
# Fallback: infer from gene type if available
99+
if gene_type is not None:
100+
gene_type_map = {
101+
3: "rRNA",
102+
4: "tRNA",
103+
5: "snRNA",
104+
6: "ncRNA",
105+
}
106+
return gene_type_map.get(gene_type)
107+
return None
108+
86109
def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
87110
"""
88111
Convert an Entrez gene record to a dictionary.
@@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
120143
else None
121144
)
122145

123-
# Extract representative accession
146+
# Extract representative accession (prefer type 3 = mRNA/transcript)
124147
representative_accession = next(
125148
(
126149
product.get("Gene-commentary_accession")
@@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
129152
),
130153
None,
131154
)
155+
# Fallback: if no type 3 accession, try any available accession
156+
# This is needed for genes that don't have mRNA transcripts but have other sequence records
157+
if not representative_accession:
158+
representative_accession = next(
159+
(
160+
product.get("Gene-commentary_accession")
161+
for product in locus.get("Gene-commentary_products", [])
162+
if product.get("Gene-commentary_accession")
163+
),
164+
None,
165+
)
132166

133167
# Extract function
134168
function = data.get("Entrezgene_summary") or next(
@@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
169203
"sequence": None,
170204
"sequence_length": None,
171205
"gene_id": gene_id,
172-
"molecule_type_detail": None,
206+
"molecule_type_detail": self._infer_molecule_type_detail(
207+
representative_accession, data.get("Entrezgene_type")
208+
),
173209
"_representative_accession": representative_accession,
174210
}
175211

176212
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
177213
"""Get gene information by Gene ID."""
178-
def _extract_from_genbank(result: dict, accession: str):
179-
"""Enrich result dictionary with sequence and summary information from accession."""
214+
def _extract_metadata_from_genbank(result: dict, accession: str):
215+
"""Extract metadata from GenBank format (title, features, organism, etc.)."""
180216
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
181217
record = SeqIO.read(handle, "genbank")
182-
result["sequence"] = str(record.seq)
183-
result["sequence_length"] = len(record.seq)
218+
184219
result["title"] = record.description
185220
result["molecule_type_detail"] = (
186221
"mRNA" if accession.startswith(("NM_", "XM_")) else
@@ -206,6 +241,22 @@ def _extract_from_genbank(result: dict, accession: str):
206241

207242
return result
208243

244+
def _extract_sequence_from_fasta(result: dict, accession: str):
245+
"""Extract sequence from FASTA format (more reliable than GenBank for CON-type records)."""
246+
try:
247+
with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
248+
fasta_record = SeqIO.read(fasta_handle, "fasta")
249+
result["sequence"] = str(fasta_record.seq)
250+
result["sequence_length"] = len(fasta_record.seq)
251+
except Exception as fasta_exc:
252+
logger.warning(
253+
"Failed to extract sequence from accession %s using FASTA format: %s",
254+
accession, fasta_exc
255+
)
256+
result["sequence"] = None
257+
result["sequence_length"] = None
258+
return result
259+
209260
try:
210261
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
211262
gene_record = Entrez.read(handle)
@@ -214,7 +265,8 @@ def _extract_from_genbank(result: dict, accession: str):
214265

215266
result = self._gene_record_to_dict(gene_record, gene_id)
216267
if accession := (preferred_accession or result.get("_representative_accession")):
217-
result = _extract_from_genbank(result, accession)
268+
result = _extract_metadata_from_genbank(result, accession)
269+
result = _extract_sequence_from_fasta(result, accession)
218270

219271
result.pop("_representative_accession", None)
220272
return result
Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
{"type": "text", "content": "TP53"}
2-
{"type": "text", "content": "BRCA1"}
3-
{"type": "text", "content": "672"}
4-
{"type": "text", "content": "11998"}
5-
{"type": "text", "content": "NM_000546"}
6-
{"type": "text", "content": "NM_024140"}
7-
{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
8-
{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
9-
1+
{"type": "text", "content": "NG_033923"}
2+
{"type": "text", "content": "NG_056118"}
3+
{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
4+
{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{"type": "text", "content": "hsa-let-7a-1"}
2+
{"type": "text", "content": "XIST regulator"}
23
{"type": "text", "content": "URS0000123456"}
34
{"type": "text", "content": "URS0000000001"}
5+
{"type": "text", "content": "URS0000000787"}
6+
{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"}
47
{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
58
{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}

scripts/search/build_db/build_dna_blast_db.sh

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ set -e
2424
# - {category}.{number}.genomic.fna.gz (基因组序列)
2525
# - {category}.{number}.rna.fna.gz (RNA序列)
2626
#
27-
# Usage: ./build_dna_blast_db.sh [representative|complete|all]
27+
# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
28+
# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
2829
# representative: Download genomic sequences from major categories (recommended, smaller)
2930
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
3031
# complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -35,7 +36,7 @@ set -e
3536
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
3637
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
3738

38-
DOWNLOAD_TYPE=${1:-representative}
39+
DOWNLOAD_TYPE=${1:-human_mouse}
3940

4041
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
4142
DOWNLOAD_TMP=_downloading_dna
@@ -57,8 +58,66 @@ else
5758
echo "Using date as release identifier: ${RELEASE}"
5859
fi
5960

61+
# Function to check if a file contains target species
62+
check_file_for_species() {
63+
local url=$1
64+
local filename=$2
65+
local temp_file="/tmp/check_${filename//\//_}"
66+
67+
# Download first 500KB (enough to get many sequence headers)
68+
# This should be sufficient to identify the species in most cases
69+
if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
70+
# Try to decompress and check for species names
71+
if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
72+
rm -f "${temp_file}"
73+
return 0 # Contains target species
74+
else
75+
rm -f "${temp_file}"
76+
return 1 # Does not contain target species
77+
fi
78+
else
79+
# If partial download fails, skip this file (don't download it)
80+
rm -f "${temp_file}"
81+
return 1
82+
fi
83+
}
84+
6085
# Download based on type
6186
case ${DOWNLOAD_TYPE} in
87+
human_mouse)
88+
echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
89+
echo "This will check each file to see if it contains human or mouse sequences..."
90+
category="vertebrate_mammalian"
91+
echo "Checking files in ${category} category..."
92+
93+
# Get list of files and save to temp file to avoid subshell issues
94+
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
95+
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
96+
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
97+
98+
file_count=0
99+
download_count=0
100+
101+
while read filename; do
102+
file_count=$((file_count + 1))
103+
url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
104+
echo -n "[${file_count}] Checking ${filename}... "
105+
106+
if check_file_for_species "${url}" "${filename}"; then
107+
echo "✓ contains target species, downloading..."
108+
download_count=$((download_count + 1))
109+
wget -c -q --show-progress "${url}" || {
110+
echo "Warning: Failed to download ${filename}"
111+
}
112+
else
113+
echo "✗ skipping (no human/mouse data)"
114+
fi
115+
done < /tmp/refseq_files.txt
116+
117+
rm -f /tmp/refseq_files.txt
118+
echo ""
119+
echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
120+
;;
62121
representative)
63122
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
64123
# Download major categories for representative coverage
@@ -109,7 +168,11 @@ case ${DOWNLOAD_TYPE} in
109168
;;
110169
*)
111170
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
112-
echo "Usage: $0 [representative|complete|all]"
171+
echo "Usage: $0 [human_mouse|representative|complete|all]"
172+
echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
173+
echo " representative: Download major categories (recommended)"
174+
echo " complete: Download all complete genomic sequences (very large)"
175+
echo " all: Download all genomic sequences (extremely large)"
113176
echo "Note: For RNA sequences, use build_rna_blast_db.sh instead"
114177
exit 1
115178
;;

0 commit comments

Comments
 (0)