From 166c1e67e7cfe0161e94bd9656d100b1235dc79e Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Fri, 6 Feb 2026 15:13:39 +0800 Subject: [PATCH 1/3] feat: add reactome_searcher --- .../input_examples/search_reactome_demo.jsonl | 4 + .../search_reactome_config.yaml | 29 + graphgen/models/__init__.py | 2 + .../models/searcher/db/reactome_searcher.py | 620 ++++++++++++++++++ graphgen/operators/search/search_service.py | 5 + 5 files changed, 660 insertions(+) create mode 100644 examples/input_examples/search_reactome_demo.jsonl create mode 100644 examples/search/search_reactome/search_reactome_config.yaml create mode 100644 graphgen/models/searcher/db/reactome_searcher.py diff --git a/examples/input_examples/search_reactome_demo.jsonl b/examples/input_examples/search_reactome_demo.jsonl new file mode 100644 index 00000000..57ebf31f --- /dev/null +++ b/examples/input_examples/search_reactome_demo.jsonl @@ -0,0 +1,4 @@ +{"content": "R-HSA-69278", "type": "reactome_id"} +{"content": "apoptosis", "type": "keyword"} +{"content": "TP53", "type": "gene_symbol"} +{"content": "MAPK1\nERK2\nPTEN", "type": "gene_list"} diff --git a/examples/search/search_reactome/search_reactome_config.yaml b/examples/search/search_reactome/search_reactome_config.yaml new file mode 100644 index 00000000..b05ae91d --- /dev/null +++ b/examples/search/search_reactome/search_reactome_config.yaml @@ -0,0 +1,29 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_reactome_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_source: reactome # data source for searcher, support: uniprot, ncbi, rnacentral, reactome + reactome_params: + species: "Homo sapiens" # species name, support: Homo sapiens, Mus musculus, Rattus norvegicus, etc. + timeout: 30 # request timeout in seconds + max_retries: 3 # maximum number of retries for failed requests diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 6b75587c..1cab54d7 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -43,6 +43,7 @@ ) from .rephraser import StyleControlledRephraser from .searcher.db.ncbi_searcher import NCBISearch + from .searcher.db.reactome_searcher import ReactomeSearcher from .searcher.db.rnacentral_searcher import RNACentralSearch from .searcher.db.uniprot_searcher import UniProtSearch from .searcher.kg.wiki_search import WikiSearch @@ -97,6 +98,7 @@ # Searcher "NCBISearch": ".searcher.db.ncbi_searcher", "RNACentralSearch": ".searcher.db.rnacentral_searcher", + "ReactomeSearcher": ".searcher.db.reactome_searcher", "UniProtSearch": ".searcher.db.uniprot_searcher", "WikiSearch": ".searcher.kg.wiki_search", "BingSearch": ".searcher.web.bing_search", diff --git a/graphgen/models/searcher/db/reactome_searcher.py b/graphgen/models/searcher/db/reactome_searcher.py new file mode 100644 index 00000000..8f06b3a5 --- /dev/null +++ b/graphgen/models/searcher/db/reactome_searcher.py @@ -0,0 +1,620 @@ +import re +from typing import Dict, List, Optional, Union + +import requests +from requests.exceptions import RequestException, Timeout +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +class ReactomeSearcher(BaseSearcher): + """ + Reactome Search client to query biological pathways and reactions. + + Reactome is a free, open-source, curated pathway database with 2,825+ human pathways. + It employs a reductionist data model representing biology as reactions converting + input physical entities into output physical entities. + + Features: + 1) Get pathway/reaction/entity by Reactome stable ID (e.g., R-HSA-69278). + 2) Search with keywords to find pathways, reactions, proteins, or small molecules. + 3) Perform overrepresentation analysis on gene/protein lists to find enriched pathways. + + API Documentation: https://reactome.org/dev/content-service + """ + + CONTENT_BASE_URL = "https://reactome.org/ContentService" + ANALYSIS_BASE_URL = "https://reactome.org/AnalysisService" + DEFAULT_SPECIES = "Homo sapiens" + SUPPORTED_SPECIES = { + "Homo sapiens": "HSA", + "Mus musculus": "MMU", + "Rattus norvegicus": "RNO", + "Gallus gallus": "GGA", + "Danio rerio": "DRE", + "Drosophila melanogaster": "DME", + "Caenorhabditis elegans": "CEL", + "Saccharomyces cerevisiae": "SCE", + } + + def __init__( + self, + species: str = "Homo sapiens", + timeout: int = 30, + max_retries: int = 3, + ): + """ + Initialize Reactome searcher. + + Args: + species: Species name (default: Homo sapiens) + timeout: Request timeout in seconds + max_retries: Maximum number of retries for failed requests + """ + self.species = ( + species if species in self.SUPPORTED_SPECIES else self.DEFAULT_SPECIES + ) + self.species_code = self.SUPPORTED_SPECIES.get(self.species, "HSA") + self.timeout = timeout + self.max_retries = max_retries + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + def _build_url(self, endpoint: str, service: str = "content") -> str: + """Build full API URL.""" + base = self.CONTENT_BASE_URL if service == "content" else self.ANALYSIS_BASE_URL + return f"{base}{endpoint}" + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((RequestException, Timeout)), + reraise=True, + ) + def _get( + self, endpoint: str, params: Optional[Dict] = None, service: str = "content" + ) -> Optional[Dict]: + """ + Perform GET request with retry logic. + + Args: + endpoint: API endpoint path + params: Query parameters + service: 'content' or 'analysis' + + Returns: + JSON response as dictionary or None if not found + """ + url = self._build_url(endpoint, service) + try: + response = self.session.get(url, params=params, timeout=self.timeout) + response.raise_for_status() + + # Handle both JSON and text responses + content_type = response.headers.get("content-type", "") + if "application/json" in content_type: + return response.json() + return {"text": response.text} + + except requests.HTTPError as e: + if e.response.status_code == 404: + logger.warning("Reactome resource not found: %s", url) + return None + raise + except Timeout: + logger.error("Request timeout for %s", url) + raise + except Exception as exc: + logger.error("Request failed for %s: %s", url, exc) + raise + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((RequestException, Timeout)), + reraise=True, + ) + def _post( + self, + endpoint: str, + data: Union[str, List[str]], + service: str = "analysis", + headers: Optional[Dict] = None, + ) -> Optional[Dict]: + """ + Perform POST request with retry logic. + + Args: + endpoint: API endpoint path + data: Data to send + service: 'content' or 'analysis' + headers: Optional headers override + + Returns: + JSON response as dictionary + """ + url = self._build_url(endpoint, service) + request_headers = headers or {"Content-Type": "text/plain"} + + try: + if isinstance(data, list): + data = "\n".join(data) + + response = self.session.post( + url, data=data, headers=request_headers, timeout=self.timeout + ) + response.raise_for_status() + return response.json() + + except Timeout: + logger.error("POST request timeout for %s", url) + raise + except Exception as exc: + logger.error("POST request failed for %s: %s", url, exc) + raise + + def get_by_id(self, reactome_id: str) -> Optional[Dict]: + """ + Retrieve detailed information about a Reactome entity by its stable ID. + + Reactome ID format: R-{species_code}-{number} (e.g., R-HSA-69278) + + Args: + reactome_id: Reactome stable identifier + + Returns: + Dictionary containing entity information or None if not found + """ + if not reactome_id or not isinstance(reactome_id, str): + logger.error("Invalid Reactome ID provided") + return None + + # Normalize ID format + reactome_id = reactome_id.strip().upper() + + # Validate ID format (e.g., R-HSA-69278, R-MMU-12345) + if not re.fullmatch(r"R-[A-Z]{3}-\d+", reactome_id): + logger.warning("Unexpected Reactome ID format: %s", reactome_id) + + logger.debug("Fetching Reactome entity: %s", reactome_id) + + result = self._get(f"/data/query/{reactome_id}") + if not result: + return None + + return self._normalize_entity(result) + + def _normalize_entity(self, data: Dict) -> Dict: + """ + Normalize Reactome entity data to standard format. + + Args: + data: Raw API response + + Returns: + Standardized dictionary format + """ + entity_type = data.get("schemaClass", "Unknown") + stable_id = data.get("stId", data.get("dbId", "Unknown")) + + normalized = { + "molecule_type": self._map_entity_type(entity_type), + "database": "Reactome", + "id": stable_id, + "name": data.get("displayName", "Unknown"), + "description": data.get("summation", [{}])[0].get("text", "") + if isinstance(data.get("summation"), list) + else "", + "species": data.get("speciesName", self.species), + "url": f"https://reactome.org/content/detail/{stable_id}", + "entity_type": entity_type, + "is_in_disease": data.get("isInDisease", False), + "is_inferred": data.get("isInferred", False), + } + + # Add type-specific fields + if entity_type in ["Pathway", "TopLevelPathway"]: + normalized.update( + { + "has_diagram": data.get("hasDiagram", False), + "has_disease": data.get("hasDisease", False), + "pathway_types": [ + c.get("displayName") for c in data.get("compartment", []) + ] + if data.get("compartment") + else [], + } + ) + elif entity_type in [ + "Reaction", + "BlackBoxEvent", + "Polymerisation", + "Depolymerisation", + ]: + normalized.update( + { + "reaction_type": entity_type, + "is_chimeric": data.get("isChimeric", False), + } + ) + elif entity_type in [ + "EntityWithAccessionedSequence", + "SimpleEntity", + "Complex", + "EntitySet", + ]: + normalized.update( + { + "reference_entities": [ + ref.get("dbId") for ref in data.get("referenceEntity", []) + ] + if isinstance(data.get("referenceEntity"), list) + else [], + } + ) + + # Add cross-references if available + if data.get("crossReference"): + normalized["cross_references"] = [ + { + "database": ref.get("referenceDatabase", "Unknown"), + "identifier": ref.get("identifier", "Unknown"), + } + for ref in data.get("crossReference", []) + ] + + return normalized + + def _map_entity_type(self, schema_class: str) -> str: + """Map Reactome schema classes to generic molecule types.""" + mapping = { + "Pathway": "pathway", + "TopLevelPathway": "pathway", + "Reaction": "reaction", + "BlackBoxEvent": "reaction", + "Polymerisation": "reaction", + "Depolymerisation": "reaction", + "EntityWithAccessionedSequence": "protein", + "SimpleEntity": "small_molecule", + "Complex": "complex", + "EntitySet": "entity_set", + "GenomeEncodedEntity": "genome_entity", + } + return mapping.get(schema_class, "other") + + def get_best_hit(self, keyword: str) -> Optional[Dict]: + """ + Search Reactome with a keyword and return the best (first) hit. + + Args: + keyword: Search term (gene symbol, protein name, pathway name, etc.) + + Returns: + Best matching entity or None if not found + """ + if not keyword or not isinstance(keyword, str): + return None + + keyword = keyword.strip() + if not keyword: + return None + + logger.debug("Searching Reactome for keyword: %s", keyword) + + # Use the search endpoint with clusters parameter for better results + params = { + "query": keyword, + "species": self.species_code, + "rows": 1, + "cluster": "true", + } + + result = self._get("/search/query", params=params) + if not result or not result.get("results"): + logger.info("No Reactome results found for keyword: %s", keyword) + return None + + # Get first result + best_hit = result["results"][0] + entry_id = best_hit.get("stId") + + if not entry_id: + logger.warning("Search result missing stable ID") + return None + + # Fetch full details for the best hit + return self.get_by_id(entry_id) + + def search_pathways( + self, query: str, include_disease: bool = True, limit: int = 10 + ) -> List[Dict]: + """ + Search for pathways matching the query. + + Args: + query: Search term + include_disease: Whether to include disease pathways + limit: Maximum number of results + + Returns: + List of pathway dictionaries + """ + params = { + "query": query, + "species": self.species_code, + "types": "Pathway", + "rows": limit, + "start": 0, + } + + if not include_disease: + params["compartment"] = "NOT disease" + + result = self._get("/search/query", params=params) + if not result or not result.get("results"): + return [] + + pathways = [] + for hit in result.get("results", [])[:limit]: + if hit.get("stId"): + detail = self.get_by_id(hit["stId"]) + if detail: + pathways.append(detail) + + return pathways + + def get_participating_molecules(self, event_id: str) -> List[Dict]: + """ + Get all participating physical entities in a pathway or reaction. + + Args: + event_id: Reactome pathway or reaction ID + + Returns: + List of participating molecules + """ + if not event_id: + return [] + + result = self._get(f"/data/event/{event_id}/participatingPhysicalEntities") + if not result or not isinstance(result, list): + return [] + + molecules = [] + for entity in result: + normalized = ( + self._normalize_entity(entity) + if isinstance(entity, dict) + else {"id": str(entity)} + ) + molecules.append(normalized) + + return molecules + + def analyze_genes( + self, + gene_list: Union[str, List[str]], + projection: bool = False, + interactors: bool = False, + include_disease: bool = True, + ) -> Optional[Dict]: + """ + Perform overrepresentation analysis on a list of genes/proteins. + + This maps genes to Reactome pathways and performs statistical enrichment analysis. + + Args: + gene_list: List of gene symbols, UniProt IDs, or Ensembl IDs (or newline-separated string) + projection: If True, project results to human pathways regardless of input species + interactors: If True, include interactors in the analysis + include_disease: If True, include disease pathways in results + + Returns: + Analysis results dictionary containing pathways, statistics, and token + """ + if isinstance(gene_list, list): + identifiers = gene_list + else: + identifiers = [ + line.strip() for line in gene_list.strip().split("\n") if line.strip() + ] + + if not identifiers: + logger.error("Empty gene list provided for analysis") + return None + + logger.debug("Analyzing %d genes in Reactome", len(identifiers)) + + # Build endpoint + endpoint = "/identifiers/" + params = {} + if projection: + endpoint += "projection/" + if interactors: + params["interactors"] = "true" + if include_disease: + params["includeDisease"] = "true" + + # Construct query string + query_params = ( + "&".join([f"{k}={v}" for k, v in params.items()]) if params else "" + ) + if query_params: + endpoint += f"?{query_params}" + + try: + result = self._post(endpoint, identifiers, service="analysis") + if not result: + return None + + # Normalize analysis results + return self._normalize_analysis_result(result) + + except Exception as exc: + logger.error("Gene analysis failed: %s", exc) + return None + + def _normalize_analysis_result(self, data: Dict) -> Dict: + """ + Normalize analysis service response. + + Args: + data: Raw analysis API response + + Returns: + Standardized analysis results + """ + summary = data.get("summary", {}) + pathways = data.get("pathways", []) + + normalized = { + "database": "Reactome", + "analysis_type": "overrepresentation", + "token": summary.get("token"), # Token valid for 7 days to retrieve results + "species": summary.get("speciesName", self.species), + "total_pathways": len(pathways), + "pathways": [], + } + + for pathway in pathways: + path_data = { + "id": pathway.get("stId"), + "name": pathway.get("name"), + "database": "Reactome", + "url": f"https://reactome.org/PathwayBrowser/#{pathway.get('stId')}", + "statistics": { + "p_value": pathway.get("entities", {}).get("pValue"), + "fdr": pathway.get("entities", {}).get("fdr"), + "ratio": pathway.get("entities", {}).get("ratio"), + "found_entities": pathway.get("entities", {}).get("found"), + "total_entities": pathway.get("entities", {}).get("total"), + }, + "reactions": { + "found": pathway.get("reactions", {}).get("found"), + "total": pathway.get("reactions", {}).get("total"), + }, + "is_disease": pathway.get("isDisease", False), + "is_inferred": pathway.get("isInferred", False), + } + normalized["pathways"].append(path_data) + + # Sort by FDR + normalized["pathways"].sort(key=lambda x: x["statistics"]["fdr"] or 1.0) + + return normalized + + def get_analysis_by_token(self, token: str) -> Optional[Dict]: + """ + Retrieve previous analysis results by token. + + Tokens are valid for 7 days. + + Args: + token: Analysis token from previous analyze_genes call + + Returns: + Analysis results dictionary + """ + if not token: + return None + + result = self._get(f"/token/{token}", service="analysis") + if result: + return self._normalize_analysis_result(result) + return None + + def get_pathway_browser_url( + self, pathway_id: str, token: Optional[str] = None + ) -> str: + """ + Generate URL to view pathway in Reactome Pathway Browser. + + Args: + pathway_id: Reactome pathway ID + token: Optional analysis token to overlay results + + Returns: + URL string + """ + base_url = f"https://reactome.org/PathwayBrowser/#{pathway_id}" + if token: + base_url += f"&DTAB=AN&ANALYSIS={token}" + return base_url + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + def search(self, query: str, **kwargs) -> Optional[Dict]: + """ + Unified search interface for Reactome. + + Auto-detects query type: + - Reactome ID (R-HSA-XXXXX): Direct lookup + - Gene/protein list (multiline or comma-separated): Enrichment analysis + - Single keyword: Best match lookup + + Args: + query: Search query (ID, keyword, or gene list) + **kwargs: Additional parameters: + - threshold: Not used for Reactome (kept for interface consistency) + - include_disease: Include disease pathways (default: True) + - projection: Project to human pathways (default: False) + + Returns: + Dictionary containing search results + """ + if not query or not isinstance(query, str): + logger.error("Empty or invalid query") + return None + + query = query.strip() + include_disease = kwargs.get("include_disease", True) + projection = kwargs.get("projection", False) + + logger.debug("Reactome search query: %s", query) + + result = None + + # Check if Reactome ID (R-HSA-69278 format) + if re.fullmatch(r"R-[A-Z]{3}-\d+", query, re.I): + result = self.get_by_id(query) + + # Check if multi-line (gene list for enrichment) + elif "\n" in query or "," in query: + # Parse gene list + genes = [g.strip() for g in re.split(r"[\n,]", query) if g.strip()] + if len(genes) > 1 or (len(genes) == 1 and len(genes[0]) < 20): + # Likely a gene list + result = self.analyze_genes( + genes, projection=projection, include_disease=include_disease + ) + else: + # Single long string, treat as keyword + result = self.get_best_hit(query) + else: + # Single keyword search + result = self.get_best_hit(query) + + if result: + result["_search_query"] = query + return result + + def __del__(self): + """Cleanup session.""" + if hasattr(self, "session"): + self.session.close() diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py index 1a599e25..5ad4deb2 100644 --- a/graphgen/operators/search/search_service.py +++ b/graphgen/operators/search/search_service.py @@ -58,6 +58,11 @@ def _init_searcher(self): params = self.kwargs.get("rnacentral_params", {}) self.searcher = RNACentralSearch(**params) + elif self.data_source == "reactome": + from graphgen.models import ReactomeSearcher + + params = self.kwargs.get("reactome_params", {}) + self.searcher = ReactomeSearcher(**params) else: logger.error(f"Unknown data source: {self.data_source}") From 82b85527d47a3ef6b8d01e3d180ff2adea434f46 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 17:16:07 +0800 Subject: [PATCH 2/3] refactor: refactor reactome_searcher --- .../models/searcher/db/reactome_searcher.py | 754 +++++------------- 1 file changed, 218 insertions(+), 536 deletions(-) diff --git a/graphgen/models/searcher/db/reactome_searcher.py b/graphgen/models/searcher/db/reactome_searcher.py index 8f06b3a5..71db324a 100644 --- a/graphgen/models/searcher/db/reactome_searcher.py +++ b/graphgen/models/searcher/db/reactome_searcher.py @@ -1,620 +1,302 @@ import re -from typing import Dict, List, Optional, Union +import time +from typing import Any, Dict, Optional import requests -from requests.exceptions import RequestException, Timeout -from tenacity import ( - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from graphgen.bases import BaseSearcher +from requests.exceptions import RequestException + from graphgen.utils import logger -class ReactomeSearcher(BaseSearcher): +class ReactomeSearcher: """ - Reactome Search client to query biological pathways and reactions. - - Reactome is a free, open-source, curated pathway database with 2,825+ human pathways. - It employs a reductionist data model representing biology as reactions converting - input physical entities into output physical entities. + Reactome Pathway Search client for retrieving biological pathways by UniProt ID. - Features: - 1) Get pathway/reaction/entity by Reactome stable ID (e.g., R-HSA-69278). - 2) Search with keywords to find pathways, reactions, proteins, or small molecules. - 3) Perform overrepresentation analysis on gene/protein lists to find enriched pathways. + Supports: + 1) Search pathways associated with a protein by UniProt accession. + 2) Rank pathways by relevance (curated vs inferred, diagram availability). + 3) Fetch detailed annotations for top-ranked pathways. - API Documentation: https://reactome.org/dev/content-service + API Documentation: https://reactome.org/ContentService """ - CONTENT_BASE_URL = "https://reactome.org/ContentService" - ANALYSIS_BASE_URL = "https://reactome.org/AnalysisService" - DEFAULT_SPECIES = "Homo sapiens" - SUPPORTED_SPECIES = { - "Homo sapiens": "HSA", - "Mus musculus": "MMU", - "Rattus norvegicus": "RNO", - "Gallus gallus": "GGA", - "Danio rerio": "DRE", - "Drosophila melanogaster": "DME", - "Caenorhabditis elegans": "CEL", - "Saccharomyces cerevisiae": "SCE", - } + CONTENT_URL = "https://reactome.org/ContentService" + + # UniProt accession pattern (e.g., P04637, Q96KN2, O14763) + UNIPROT_PATTERN = re.compile( + r"^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$" + ) def __init__( self, species: str = "Homo sapiens", timeout: int = 30, - max_retries: int = 3, + top_n_details: int = 5, ): """ - Initialize Reactome searcher. + Initialize the Reactome Pathway Search client. Args: - species: Species name (default: Homo sapiens) - timeout: Request timeout in seconds - max_retries: Maximum number of retries for failed requests + species: Species name (e.g., "Homo sapiens", "Mus musculus") or code ("HSA"). + timeout: Request timeout in seconds. + top_n_details: Number of top pathways to fetch detailed annotations for. """ - self.species = ( - species if species in self.SUPPORTED_SPECIES else self.DEFAULT_SPECIES - ) - self.species_code = self.SUPPORTED_SPECIES.get(self.species, "HSA") self.timeout = timeout - self.max_retries = max_retries + self.species = self._normalize_species(species) + self.top_n_details = top_n_details self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - def _build_url(self, endpoint: str, service: str = "content") -> str: - """Build full API URL.""" - base = self.CONTENT_BASE_URL if service == "content" else self.ANALYSIS_BASE_URL - return f"{base}{endpoint}" - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=2, max=10), - retry=retry_if_exception_type((RequestException, Timeout)), - reraise=True, - ) - def _get( - self, endpoint: str, params: Optional[Dict] = None, service: str = "content" - ) -> Optional[Dict]: - """ - Perform GET request with retry logic. + self.session.headers.update({"Accept": "application/json"}) + + @staticmethod + def _normalize_species(species: str) -> str: + """Convert species code to full name.""" + species_map = { + "HSA": "Homo sapiens", + "MMU": "Mus musculus", + "RNO": "Rattus norvegicus", + "GGA": "Gallus gallus", + "CEL": "Caenorhabditis elegans", + "DME": "Drosophila melanogaster", + } + return species_map.get(species.upper(), species) - Args: - endpoint: API endpoint path - params: Query parameters - service: 'content' or 'analysis' + @staticmethod + def _is_uniprot_accession(text: str) -> bool: + """Check if text is a valid UniProt accession number.""" + if not text or not isinstance(text, str): + return False + return bool(ReactomeSearcher.UNIPROT_PATTERN.match(text.strip())) - Returns: - JSON response as dictionary or None if not found + def _calculate_relevance_score(self, pathway: Dict[str, Any]) -> int: """ - url = self._build_url(endpoint, service) - try: - response = self.session.get(url, params=params, timeout=self.timeout) - response.raise_for_status() - - # Handle both JSON and text responses - content_type = response.headers.get("content-type", "") - if "application/json" in content_type: - return response.json() - return {"text": response.text} - - except requests.HTTPError as e: - if e.response.status_code == 404: - logger.warning("Reactome resource not found: %s", url) - return None - raise - except Timeout: - logger.error("Request timeout for %s", url) - raise - except Exception as exc: - logger.error("Request failed for %s: %s", url, exc) - raise - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=2, max=10), - retry=retry_if_exception_type((RequestException, Timeout)), - reraise=True, - ) - def _post( - self, - endpoint: str, - data: Union[str, List[str]], - service: str = "analysis", - headers: Optional[Dict] = None, - ) -> Optional[Dict]: + Calculate relevance score for pathway ranking. + Higher score indicates higher biological significance. + + Scoring criteria: + - Manual curation (not inferred): +10 + - Has pathway diagram: +5 + - Disease-related: +3 + - Specific biological terms in name: +2 + """ + score = 0 + + # Prioritize manually curated over computational predictions + if not pathway.get("isInferred", True): + score += 10 + + # Visual representations indicate well-characterized pathways + if pathway.get("hasDiagram", False): + score += 5 + + # Disease pathways often have higher clinical relevance + if pathway.get("isInDisease", False): + score += 3 + + # Prefer specific pathway types over generic classifications + name = pathway.get("displayName", "").lower() + specific_terms = [ + "signaling", + "regulation", + "activation", + "pathway", + "synthesis", + "degradation", + "repair", + "apoptosis", + ] + if any(term in name for term in specific_terms): + score += 2 + + return score + + def _fetch_pathway_details(self, pathway_stid: str) -> Optional[Dict[str, Any]]: """ - Perform POST request with retry logic. + Fetch detailed information for a specific pathway. Args: - endpoint: API endpoint path - data: Data to send - service: 'content' or 'analysis' - headers: Optional headers override + pathway_stid: Reactome stable ID (e.g., "R-HSA-111288"). Returns: - JSON response as dictionary + Dictionary with detailed annotations or None if fetch fails. """ - url = self._build_url(endpoint, service) - request_headers = headers or {"Content-Type": "text/plain"} + url = f"{self.CONTENT_URL}/data/query/{pathway_stid}" try: - if isinstance(data, list): - data = "\n".join(data) + response = self.session.get(url, timeout=self.timeout) + if response.status_code == 404: + logger.warning("Pathway %s not found in Reactome", pathway_stid) + return None - response = self.session.post( - url, data=data, headers=request_headers, timeout=self.timeout - ) response.raise_for_status() - return response.json() - - except Timeout: - logger.error("POST request timeout for %s", url) - raise - except Exception as exc: - logger.error("POST request failed for %s: %s", url, exc) - raise - - def get_by_id(self, reactome_id: str) -> Optional[Dict]: - """ - Retrieve detailed information about a Reactome entity by its stable ID. - - Reactome ID format: R-{species_code}-{number} (e.g., R-HSA-69278) - - Args: - reactome_id: Reactome stable identifier - - Returns: - Dictionary containing entity information or None if not found - """ - if not reactome_id or not isinstance(reactome_id, str): - logger.error("Invalid Reactome ID provided") - return None - - # Normalize ID format - reactome_id = reactome_id.strip().upper() - - # Validate ID format (e.g., R-HSA-69278, R-MMU-12345) - if not re.fullmatch(r"R-[A-Z]{3}-\d+", reactome_id): - logger.warning("Unexpected Reactome ID format: %s", reactome_id) + data = response.json() + + # Extract key annotations + details = { + "schemaClass": data.get("schemaClass"), + "summation": data.get("summation", [None])[0] + if data.get("summation") + else None, + "compartment": [ + c.get("displayName") for c in data.get("compartment", []) + ], + "disease": [d.get("displayName") for d in data.get("disease", [])], + "sub_pathways": [ + {"stId": e.get("stId"), "name": e.get("displayName")} + for e in data.get("hasEvent", [])[:5] # First 5 sub-events + ], + "literature_references": [ + { + "pubMedId": ref.get("pubMedIdentifier"), + "title": ref.get("displayName"), + } + for ref in data.get("literatureReference", [])[:3] # Top 3 refs + ], + } - logger.debug("Fetching Reactome entity: %s", reactome_id) + return details - result = self._get(f"/data/query/{reactome_id}") - if not result: + except RequestException as e: + logger.error("Failed to fetch details for pathway %s: %s", pathway_stid, e) return None - return self._normalize_entity(result) - - def _normalize_entity(self, data: Dict) -> Dict: - """ - Normalize Reactome entity data to standard format. - - Args: - data: Raw API response - - Returns: - Standardized dictionary format + def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: """ - entity_type = data.get("schemaClass", "Unknown") - stable_id = data.get("stId", data.get("dbId", "Unknown")) - - normalized = { - "molecule_type": self._map_entity_type(entity_type), - "database": "Reactome", - "id": stable_id, - "name": data.get("displayName", "Unknown"), - "description": data.get("summation", [{}])[0].get("text", "") - if isinstance(data.get("summation"), list) - else "", - "species": data.get("speciesName", self.species), - "url": f"https://reactome.org/content/detail/{stable_id}", - "entity_type": entity_type, - "is_in_disease": data.get("isInDisease", False), - "is_inferred": data.get("isInferred", False), - } - - # Add type-specific fields - if entity_type in ["Pathway", "TopLevelPathway"]: - normalized.update( - { - "has_diagram": data.get("hasDiagram", False), - "has_disease": data.get("hasDisease", False), - "pathway_types": [ - c.get("displayName") for c in data.get("compartment", []) - ] - if data.get("compartment") - else [], - } - ) - elif entity_type in [ - "Reaction", - "BlackBoxEvent", - "Polymerisation", - "Depolymerisation", - ]: - normalized.update( - { - "reaction_type": entity_type, - "is_chimeric": data.get("isChimeric", False), - } - ) - elif entity_type in [ - "EntityWithAccessionedSequence", - "SimpleEntity", - "Complex", - "EntitySet", - ]: - normalized.update( - { - "reference_entities": [ - ref.get("dbId") for ref in data.get("referenceEntity", []) - ] - if isinstance(data.get("referenceEntity"), list) - else [], - } - ) - - # Add cross-references if available - if data.get("crossReference"): - normalized["cross_references"] = [ - { - "database": ref.get("referenceDatabase", "Unknown"), - "identifier": ref.get("identifier", "Unknown"), - } - for ref in data.get("crossReference", []) - ] - - return normalized - - def _map_entity_type(self, schema_class: str) -> str: - """Map Reactome schema classes to generic molecule types.""" - mapping = { - "Pathway": "pathway", - "TopLevelPathway": "pathway", - "Reaction": "reaction", - "BlackBoxEvent": "reaction", - "Polymerisation": "reaction", - "Depolymerisation": "reaction", - "EntityWithAccessionedSequence": "protein", - "SimpleEntity": "small_molecule", - "Complex": "complex", - "EntitySet": "entity_set", - "GenomeEncodedEntity": "genome_entity", - } - return mapping.get(schema_class, "other") + Search Reactome pathways by UniProt accession number. - def get_best_hit(self, keyword: str) -> Optional[Dict]: - """ - Search Reactome with a keyword and return the best (first) hit. + Retrieves all pathways associated with the protein, ranks them by relevance, + and fetches detailed annotations for the top N pathways. Args: - keyword: Search term (gene symbol, protein name, pathway name, etc.) + accession: UniProt accession number (e.g., "P04637" for TP53). Returns: - Best matching entity or None if not found + Dictionary with pathway information or None if search fails: + { + "molecule_type": "protein", + "database": "Reactome", + "id": accession, + "content": { + "total_found": int, + "pathways": List[Dict] # Top pathways with details + }, + "url": str # Link to Reactome search + } """ - if not keyword or not isinstance(keyword, str): + if not self._is_uniprot_accession(accession): + logger.error("Invalid UniProt accession format: %s", accession) return None - keyword = keyword.strip() - if not keyword: - return None - - logger.debug("Searching Reactome for keyword: %s", keyword) + accession = accession.strip().upper() + logger.debug("Searching Reactome pathways for %s", accession) - # Use the search endpoint with clusters parameter for better results + # Step 1: Search for all pathways + url = f"{self.CONTENT_URL}/search/query" params = { - "query": keyword, - "species": self.species_code, - "rows": 1, - "cluster": "true", + "query": accession, + "species": self.species, + "rows": 100, + "type": "Pathway", } - result = self._get("/search/query", params=params) - if not result or not result.get("results"): - logger.info("No Reactome results found for keyword: %s", keyword) - return None - - # Get first result - best_hit = result["results"][0] - entry_id = best_hit.get("stId") - - if not entry_id: - logger.warning("Search result missing stable ID") - return None - - # Fetch full details for the best hit - return self.get_by_id(entry_id) - - def search_pathways( - self, query: str, include_disease: bool = True, limit: int = 10 - ) -> List[Dict]: - """ - Search for pathways matching the query. - - Args: - query: Search term - include_disease: Whether to include disease pathways - limit: Maximum number of results - - Returns: - List of pathway dictionaries - """ - params = { - "query": query, - "species": self.species_code, - "types": "Pathway", - "rows": limit, - "start": 0, - } - - if not include_disease: - params["compartment"] = "NOT disease" - - result = self._get("/search/query", params=params) - if not result or not result.get("results"): - return [] - - pathways = [] - for hit in result.get("results", [])[:limit]: - if hit.get("stId"): - detail = self.get_by_id(hit["stId"]) - if detail: - pathways.append(detail) - - return pathways - - def get_participating_molecules(self, event_id: str) -> List[Dict]: - """ - Get all participating physical entities in a pathway or reaction. - - Args: - event_id: Reactome pathway or reaction ID - - Returns: - List of participating molecules - """ - if not event_id: - return [] - - result = self._get(f"/data/event/{event_id}/participatingPhysicalEntities") - if not result or not isinstance(result, list): - return [] - - molecules = [] - for entity in result: - normalized = ( - self._normalize_entity(entity) - if isinstance(entity, dict) - else {"id": str(entity)} - ) - molecules.append(normalized) - - return molecules - - def analyze_genes( - self, - gene_list: Union[str, List[str]], - projection: bool = False, - interactors: bool = False, - include_disease: bool = True, - ) -> Optional[Dict]: - """ - Perform overrepresentation analysis on a list of genes/proteins. - - This maps genes to Reactome pathways and performs statistical enrichment analysis. - - Args: - gene_list: List of gene symbols, UniProt IDs, or Ensembl IDs (or newline-separated string) - projection: If True, project results to human pathways regardless of input species - interactors: If True, include interactors in the analysis - include_disease: If True, include disease pathways in results - - Returns: - Analysis results dictionary containing pathways, statistics, and token - """ - if isinstance(gene_list, list): - identifiers = gene_list - else: - identifiers = [ - line.strip() for line in gene_list.strip().split("\n") if line.strip() - ] - - if not identifiers: - logger.error("Empty gene list provided for analysis") - return None - - logger.debug("Analyzing %d genes in Reactome", len(identifiers)) - - # Build endpoint - endpoint = "/identifiers/" - params = {} - if projection: - endpoint += "projection/" - if interactors: - params["interactors"] = "true" - if include_disease: - params["includeDisease"] = "true" - - # Construct query string - query_params = ( - "&".join([f"{k}={v}" for k, v in params.items()]) if params else "" - ) - if query_params: - endpoint += f"?{query_params}" - try: - result = self._post(endpoint, identifiers, service="analysis") - if not result: - return None - - # Normalize analysis results - return self._normalize_analysis_result(result) - - except Exception as exc: - logger.error("Gene analysis failed: %s", exc) - return None - - def _normalize_analysis_result(self, data: Dict) -> Dict: - """ - Normalize analysis service response. - - Args: - data: Raw analysis API response + response = self.session.get(url, params=params, timeout=self.timeout) + response.raise_for_status() + data = response.json() - Returns: - Standardized analysis results - """ - summary = data.get("summary", {}) - pathways = data.get("pathways", []) - - normalized = { - "database": "Reactome", - "analysis_type": "overrepresentation", - "token": summary.get("token"), # Token valid for 7 days to retrieve results - "species": summary.get("speciesName", self.species), - "total_pathways": len(pathways), - "pathways": [], - } + hits = data.get("searchHits", []) + if not hits: + logger.info("No pathways found for %s in %s", accession, self.species) + return None - for pathway in pathways: - path_data = { - "id": pathway.get("stId"), - "name": pathway.get("name"), + # Step 2: Extract basic pathway info + pathways = [] + for hit in hits: + if hit.get("type") == "Pathway": + pathways.append( + { + "stId": hit.get("stId"), + "displayName": hit.get("displayName"), + "dbId": hit.get("dbId"), + "species": hit.get("species"), + "isInDisease": hit.get("isInDisease", False), + "isInferred": hit.get("isInferred", False), + "hasDiagram": hit.get("hasDiagram", False), + "url": f"https://reactome.org/PathwayBrowser/#{hit.get('stId')}", + } + ) + + logger.info("Found %d pathways for %s", len(pathways), accession) + + # Step 3: Rank by relevance score + scored = [(self._calculate_relevance_score(pw), pw) for pw in pathways] + scored.sort(key=lambda x: x[0], reverse=True) + sorted_pathways = [pw for _, pw in scored] + + # Step 4: Fetch details for top N pathways + top_pathways = [] + for i, pw in enumerate(sorted_pathways[: self.top_n_details]): + details = self._fetch_pathway_details(pw["stId"]) + if details: + pw["details"] = details + top_pathways.append(pw) + + # Small delay to avoid overwhelming API + if i < self.top_n_details - 1: + time.sleep(0.1) + else: + # Include pathway even if details fetch fails + pw["details"] = None + top_pathways.append(pw) + + # Construct result in standard format + result = { + "molecule_type": "protein", "database": "Reactome", - "url": f"https://reactome.org/PathwayBrowser/#{pathway.get('stId')}", - "statistics": { - "p_value": pathway.get("entities", {}).get("pValue"), - "fdr": pathway.get("entities", {}).get("fdr"), - "ratio": pathway.get("entities", {}).get("ratio"), - "found_entities": pathway.get("entities", {}).get("found"), - "total_entities": pathway.get("entities", {}).get("total"), + "id": accession, + "content": { + "total_found": len(pathways), + "pathways": top_pathways, }, - "reactions": { - "found": pathway.get("reactions", {}).get("found"), - "total": pathway.get("reactions", {}).get("total"), - }, - "is_disease": pathway.get("isDisease", False), - "is_inferred": pathway.get("isInferred", False), + "url": f"https://reactome.org/content/query?q={accession}", } - normalized["pathways"].append(path_data) - - # Sort by FDR - normalized["pathways"].sort(key=lambda x: x["statistics"]["fdr"] or 1.0) - return normalized + return result - def get_analysis_by_token(self, token: str) -> Optional[Dict]: - """ - Retrieve previous analysis results by token. - - Tokens are valid for 7 days. - - Args: - token: Analysis token from previous analyze_genes call - - Returns: - Analysis results dictionary - """ - if not token: + except RequestException as e: + logger.error("Failed to search Reactome for %s: %s", accession, e) return None - result = self._get(f"/token/{token}", service="analysis") - if result: - return self._normalize_analysis_result(result) - return None - - def get_pathway_browser_url( - self, pathway_id: str, token: Optional[str] = None - ) -> str: - """ - Generate URL to view pathway in Reactome Pathway Browser. - - Args: - pathway_id: Reactome pathway ID - token: Optional analysis token to overlay results - - Returns: - URL string - """ - base_url = f"https://reactome.org/PathwayBrowser/#{pathway_id}" - if token: - base_url += f"&DTAB=AN&ANALYSIS={token}" - return base_url - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type(RequestException), - reraise=True, - ) def search(self, query: str, **kwargs) -> Optional[Dict]: """ - Unified search interface for Reactome. + Search Reactome for pathway information. - Auto-detects query type: - - Reactome ID (R-HSA-XXXXX): Direct lookup - - Gene/protein list (multiline or comma-separated): Enrichment analysis - - Single keyword: Best match lookup + Automatically detects query type (currently supports UniProt accession only). Args: - query: Search query (ID, keyword, or gene list) - **kwargs: Additional parameters: - - threshold: Not used for Reactome (kept for interface consistency) - - include_disease: Include disease pathways (default: True) - - projection: Project to human pathways (default: False) + query: Search query (UniProt accession number). + **kwargs: Additional arguments (unused). Returns: - Dictionary containing search results + Dictionary with pathway information or None if not found. """ if not query or not isinstance(query, str): - logger.error("Empty or invalid query") + logger.error("Empty or invalid input for Reactome search") return None query = query.strip() - include_disease = kwargs.get("include_disease", True) - projection = kwargs.get("projection", False) - logger.debug("Reactome search query: %s", query) result = None - # Check if Reactome ID (R-HSA-69278 format) - if re.fullmatch(r"R-[A-Z]{3}-\d+", query, re.I): - result = self.get_by_id(query) - - # Check if multi-line (gene list for enrichment) - elif "\n" in query or "," in query: - # Parse gene list - genes = [g.strip() for g in re.split(r"[\n,]", query) if g.strip()] - if len(genes) > 1 or (len(genes) == 1 and len(genes[0]) < 20): - # Likely a gene list - result = self.analyze_genes( - genes, projection=projection, include_disease=include_disease - ) - else: - # Single long string, treat as keyword - result = self.get_best_hit(query) + if self._is_uniprot_accession(query): + logger.debug("Detected UniProt accession: %s", query) + result = self.search_by_uniprot_id(query) else: - # Single keyword search - result = self.get_best_hit(query) + logger.warning("Query %s not recognized as UniProt accession", query) + # Try anyway as it might be a non-standard format + result = self.search_by_uniprot_id(query) if result: result["_search_query"] = query - return result - def __del__(self): - """Cleanup session.""" - if hasattr(self, "session"): - self.session.close() + return result From a6e8bee30e159670b3e929a0acc067e3d88b902c Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 20:00:15 +0800 Subject: [PATCH 3/3] refactor: refatcor reactome_searcher --- .../models/searcher/db/reactome_searcher.py | 91 ++++++++----------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/graphgen/models/searcher/db/reactome_searcher.py b/graphgen/models/searcher/db/reactome_searcher.py index 71db324a..b7d8773b 100644 --- a/graphgen/models/searcher/db/reactome_searcher.py +++ b/graphgen/models/searcher/db/reactome_searcher.py @@ -29,7 +29,6 @@ class ReactomeSearcher: def __init__( self, - species: str = "Homo sapiens", timeout: int = 30, top_n_details: int = 5, ): @@ -37,29 +36,14 @@ def __init__( Initialize the Reactome Pathway Search client. Args: - species: Species name (e.g., "Homo sapiens", "Mus musculus") or code ("HSA"). timeout: Request timeout in seconds. top_n_details: Number of top pathways to fetch detailed annotations for. """ self.timeout = timeout - self.species = self._normalize_species(species) self.top_n_details = top_n_details self.session = requests.Session() self.session.headers.update({"Accept": "application/json"}) - @staticmethod - def _normalize_species(species: str) -> str: - """Convert species code to full name.""" - species_map = { - "HSA": "Homo sapiens", - "MMU": "Mus musculus", - "RNO": "Rattus norvegicus", - "GGA": "Gallus gallus", - "CEL": "Caenorhabditis elegans", - "DME": "Drosophila melanogaster", - } - return species_map.get(species.upper(), species) - @staticmethod def _is_uniprot_accession(text: str) -> bool: """Check if text is a valid UniProt accession number.""" @@ -78,17 +62,23 @@ def _calculate_relevance_score(self, pathway: Dict[str, Any]) -> int: - Disease-related: +3 - Specific biological terms in name: +2 """ + + # TODO: complete this function + score = 0 # Prioritize manually curated over computational predictions - if not pathway.get("isInferred", True): + # Note: Mapping API may not return this, default to False + if not pathway.get("isInferred", False): score += 10 # Visual representations indicate well-characterized pathways + # Note: Mapping API may not return this, default to False if pathway.get("hasDiagram", False): score += 5 # Disease pathways often have higher clinical relevance + # Note: Mapping API may not return this, default to False if pathway.get("isInDisease", False): score += 3 @@ -163,8 +153,9 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: """ Search Reactome pathways by UniProt accession number. - Retrieves all pathways associated with the protein, ranks them by relevance, - and fetches detailed annotations for the top N pathways. + Retrieves all pathways associated with the protein using the dedicated + mapping endpoint, ranks them by relevance, and fetches detailed + annotations for the top N pathways. Args: accession: UniProt accession number (e.g., "P04637" for TP53). @@ -189,45 +180,40 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: accession = accession.strip().upper() logger.debug("Searching Reactome pathways for %s", accession) - # Step 1: Search for all pathways - url = f"{self.CONTENT_URL}/search/query" + # Step 1: Use the correct mapping endpoint for UniProt to pathways + url = f"{self.CONTENT_URL}/data/mapping/UniProt/{accession}/pathways" params = { - "query": accession, - "species": self.species, - "rows": 100, - "type": "Pathway", + "interactors": "false", # Exclude inferred from interactors for cleaner results } try: response = self.session.get(url, params=params, timeout=self.timeout) + + if response.status_code == 404: + logger.info("No pathways found for %s", accession) + return None + response.raise_for_status() - data = response.json() - hits = data.get("searchHits", []) - if not hits: - logger.info("No pathways found for %s in %s", accession, self.species) + # The mapping API returns a list directly, not wrapped in searchHits + pathways_data = response.json() + + if not pathways_data: + logger.info("No pathways found for %s", accession) return None - # Step 2: Extract basic pathway info + # Step 2: Use pathway data as-is pathways = [] - for hit in hits: - if hit.get("type") == "Pathway": - pathways.append( - { - "stId": hit.get("stId"), - "displayName": hit.get("displayName"), - "dbId": hit.get("dbId"), - "species": hit.get("species"), - "isInDisease": hit.get("isInDisease", False), - "isInferred": hit.get("isInferred", False), - "hasDiagram": hit.get("hasDiagram", False), - "url": f"https://reactome.org/PathwayBrowser/#{hit.get('stId')}", - } - ) + for pw in pathways_data: + if isinstance(pw, dict): + pathways.append(pw) logger.info("Found %d pathways for %s", len(pathways), accession) # Step 3: Rank by relevance score + # Note: Since mapping API doesn't return isInferred/hasDiagram/isInDisease, + # we fetch details for pathways to get accurate scores if needed, + # or use name-based heuristics. Here we rank by available info. scored = [(self._calculate_relevance_score(pw), pw) for pw in pathways] scored.sort(key=lambda x: x[0], reverse=True) sorted_pathways = [pw for _, pw in scored] @@ -238,15 +224,16 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: details = self._fetch_pathway_details(pw["stId"]) if details: pw["details"] = details - top_pathways.append(pw) + # Update scoring fields if details contain them + # (Details don't have these either, but keeping structure consistent) # Small delay to avoid overwhelming API if i < self.top_n_details - 1: time.sleep(0.1) else: - # Include pathway even if details fetch fails pw["details"] = None - top_pathways.append(pw) + + top_pathways.append(pw) # Construct result in standard format result = { @@ -270,8 +257,6 @@ def search(self, query: str, **kwargs) -> Optional[Dict]: """ Search Reactome for pathway information. - Automatically detects query type (currently supports UniProt accession only). - Args: query: Search query (UniProt accession number). **kwargs: Additional arguments (unused). @@ -286,15 +271,13 @@ def search(self, query: str, **kwargs) -> Optional[Dict]: query = query.strip() logger.debug("Reactome search query: %s", query) - result = None - if self._is_uniprot_accession(query): logger.debug("Detected UniProt accession: %s", query) result = self.search_by_uniprot_id(query) else: - logger.warning("Query %s not recognized as UniProt accession", query) - # Try anyway as it might be a non-standard format - result = self.search_by_uniprot_id(query) + raise ValueError( + "ReactomeSearcher only supports UniProt accession numbers as queries." + ) if result: result["_search_query"] = query