@@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None):
8383 data = data .get (key , default )
8484 return data
8585
86+ @staticmethod
87+ def _infer_molecule_type_detail (accession : Optional [str ], gene_type : Optional [int ] = None ) -> Optional [str ]:
88+ """Infer molecule_type_detail from accession prefix or gene type."""
89+ if accession :
90+ if accession .startswith (("NM_" , "XM_" )):
91+ return "mRNA"
92+ if accession .startswith (("NC_" , "NT_" )):
93+ return "genomic DNA"
94+ if accession .startswith (("NR_" , "XR_" )):
95+ return "RNA"
96+ if accession .startswith ("NG_" ):
97+ return "genomic region"
98+ # Fallback: infer from gene type if available
99+ if gene_type is not None :
100+ gene_type_map = {
101+ 3 : "rRNA" ,
102+ 4 : "tRNA" ,
103+ 5 : "snRNA" ,
104+ 6 : "ncRNA" ,
105+ }
106+ return gene_type_map .get (gene_type )
107+ return None
108+
86109 def _gene_record_to_dict (self , gene_record , gene_id : str ) -> dict :
87110 """
88111 Convert an Entrez gene record to a dictionary.
@@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
120143 else None
121144 )
122145
123- # Extract representative accession
146+ # Extract representative accession (prefer type 3 = mRNA/transcript)
124147 representative_accession = next (
125148 (
126149 product .get ("Gene-commentary_accession" )
@@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
129152 ),
130153 None ,
131154 )
155+ # Fallback: if no type 3 accession, try any available accession
156+ # This is needed for genes that don't have mRNA transcripts but have other sequence records
157+ if not representative_accession :
158+ representative_accession = next (
159+ (
160+ product .get ("Gene-commentary_accession" )
161+ for product in locus .get ("Gene-commentary_products" , [])
162+ if product .get ("Gene-commentary_accession" )
163+ ),
164+ None ,
165+ )
132166
133167 # Extract function
134168 function = data .get ("Entrezgene_summary" ) or next (
@@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
169203 "sequence" : None ,
170204 "sequence_length" : None ,
171205 "gene_id" : gene_id ,
172- "molecule_type_detail" : None ,
206+ "molecule_type_detail" : self ._infer_molecule_type_detail (
207+ representative_accession , data .get ("Entrezgene_type" )
208+ ),
173209 "_representative_accession" : representative_accession ,
174210 }
175211
176212 def get_by_gene_id (self , gene_id : str , preferred_accession : Optional [str ] = None ) -> Optional [dict ]:
177213 """Get gene information by Gene ID."""
178- def _extract_from_genbank (result : dict , accession : str ):
179- """Enrich result dictionary with sequence and summary information from accession ."""
214+ def _extract_metadata_from_genbank (result : dict , accession : str ):
215+ """Extract metadata from GenBank format (title, features, organism, etc.) ."""
180216 with Entrez .efetch (db = "nuccore" , id = accession , rettype = "gb" , retmode = "text" ) as handle :
181217 record = SeqIO .read (handle , "genbank" )
182- result ["sequence" ] = str (record .seq )
183- result ["sequence_length" ] = len (record .seq )
218+
184219 result ["title" ] = record .description
185220 result ["molecule_type_detail" ] = (
186221 "mRNA" if accession .startswith (("NM_" , "XM_" )) else
@@ -206,6 +241,22 @@ def _extract_from_genbank(result: dict, accession: str):
206241
207242 return result
208243
def _extract_sequence_from_fasta(result: dict, accession: str):
    """Fill ``result`` with the sequence fetched for ``accession`` in FASTA format.

    FASTA is requested because it is more reliable than GenBank for CON-type
    records. This is a best-effort step: on any failure a warning is logged
    and both sequence fields are set to ``None`` instead of raising.

    Args:
        result: Dictionary to update in place; receives the ``"sequence"``
            and ``"sequence_length"`` keys.
        accession: Nucleotide accession passed to ``Entrez.efetch``.

    Returns:
        The same ``result`` dictionary.
    """
    try:
        with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
            parsed = SeqIO.read(fasta_handle, "fasta")
        residues = parsed.seq
        result.update(sequence=str(residues), sequence_length=len(residues))
    except Exception as fasta_exc:
        logger.warning(
            "Failed to extract sequence from accession %s using FASTA format: %s",
            accession, fasta_exc
        )
        # Clear both fields so a partial update never leaks out.
        result.update(sequence=None, sequence_length=None)
    return result
259+
209260 try :
210261 with Entrez .efetch (db = "gene" , id = gene_id , retmode = "xml" ) as handle :
211262 gene_record = Entrez .read (handle )
@@ -214,7 +265,8 @@ def _extract_from_genbank(result: dict, accession: str):
214265
215266 result = self ._gene_record_to_dict (gene_record , gene_id )
216267 if accession := (preferred_accession or result .get ("_representative_accession" )):
217- result = _extract_from_genbank (result , accession )
268+ result = _extract_metadata_from_genbank (result , accession )
269+ result = _extract_sequence_from_fasta (result , accession )
218270
219271 result .pop ("_representative_accession" , None )
220272 return result
0 commit comments